# Visualise data in the Quality Control practical. # Check your working directory getwd() # Set your working directory to be the folder for this practical setwd('/home/katrina/practice') # The step corresponds to those in the tutorial instructions #### Step 2.4 #### # Check the heterozygosity of chromosome X sex <- read.table("sex.sexcheck", header=T) head(sex) hist(sex$F, xlab="Probablity chrX alleles are from a single ancestor", main = "chrX Heterozygosity") boxplot(split(sex$F, sex$PEDSEX)) #### Step 3.2 #### # Sample Call Rate: individuals missing genotyped data missI<-read.table(file="miss.imiss", header=TRUE) head(missI) max(missI$F_MISS) h1 = hist(missI$F_MISS, breaks=seq(0,.1,.005)) c1 = ifelse(h1$breaks<0.05, "grey", "red")[-length(h1$breaks)] plot(h1, col=c1, xlab="Proportion of missing data: individuals", main ='Sample Call Rate') # Check how many individuals have a call rate less than .95 (or 5% missing genetic data) length(which(missI$F_MISS>.05)) #### Step 4.1.1 #### # Genotype Call Rate: SNPs missing information from too many people missL<-read.table(file="miss.lmiss", header=TRUE) head(missL) max(missL$F_MISS) h2 = hist(missL$F_MISS, breaks=seq(0,.4,.005)) c2 = ifelse(h2$breaks<0.05, "grey", "red")[-length(h2$breaks)] plot(h2, col=c2, xlab="Proportion of missing data: SNPs", main ='Genotype Call Rate') # How many SNPs are missing more than 5% of data length(which(missL$F_MISS>.05)) # BUT since we will impute data, we will be able to re-acquire this information # And out imputation will be better quality if we don't use poor quality data for the imputation # We want to maximize our sample size at this point # So we will clean out the poor quality SNPs first, and then the individuals later #### 4.2.2 #### # Minor Allele Frequency maf <- read.table("maf.frq", header =TRUE) head(maf) h3 = hist(maf$MAF, breaks=seq(0,.5,.005)) c3 = ifelse(h3$breaks>0.01, "grey", "red")[-length(h3$breaks)] plot(h3, col=c3, xlab="Minor allele frequency", main ='MAF') # Number of variants with a MAF < 0.01 length(which(maf$MAF<0.01)) #### 5.1.2 #### # Check the sample call rate after cleaning variants miss2I<-read.table(file="miss2.imiss", header=TRUE) head(miss2I) # Check how many individuals have a call rate less than .95 (or 5% missing genetic data) length(which(miss2I$F_MISS>.05)) h4 = hist(miss2I$F_MISS, breaks=seq(0,.1,.005)) c4=ifelse(h4$breaks<0.05, "grey", "red")[-length(h4$breaks)] plot(h4, col=c4, xlab="Proportion of missing data: individuals", main ='AFTER') # Compare with the plot from before cleaning the variants plot(h1, col=c1, xlab="Proportion of missing data: individuals", main ='BEFORE') #### 9.8 #### # Plot the MDS library(calibrate) mds.cluster = read.table("cc.mds.mds", header=T, stringsAsFactors = F); # The script assumes that the ancestry labels in the FID column are always the same. # ASW CEU CHB CHD GIH JPT LWK MEX MKK TSI YRI # Since our FID is all the same for our sample we can check th table(mds.cluster$FID) # Assign colours mds.cluster$color<-"red" mds.cluster$color[which(mds.cluster$FID=="ASW")]<-"darkolivegreen" mds.cluster$color[which(mds.cluster$FID=="CEU")]<-"lightblue" mds.cluster$color[which(mds.cluster$FID=="CHB")]<-"brown" mds.cluster$color[which(mds.cluster$FID=="CHD")]<-"orange" mds.cluster$color[which(mds.cluster$FID=="GIH")]<-"black" mds.cluster$color[which(mds.cluster$FID=="JPT")]<-"purple" mds.cluster$color[which(mds.cluster$FID=="LWK")]<-"magenta" mds.cluster$color[which(mds.cluster$FID=="MEX")]<-"grey50" mds.cluster$color[which(mds.cluster$FID=="MKK")]<-"darkblue" mds.cluster$color[which(mds.cluster$FID=="TSI")]<-"green" mds.cluster$color[which(mds.cluster$FID=="YRI")]<-"yellow" # Different shapes for the points mds.cluster$pchtype<-17 mds.cluster$pchtype[which(mds.cluster$color=="red")]<-19 mds.cluster.sample<-mds.cluster[which(mds.cluster$color=="red"),] # Plot points plot(mds.cluster$C2, mds.cluster$C1, col=mds.cluster$color, ylab="Dimension 1", xlab="Dimension 2",pch=mds.cluster$pchtype) # Add legend legend("bottomright", c("My Sample", "CEU", "CHB", "YRI", "TSI", "JPT", "CHD", "MEX", "GIH", "ASW","LWK", "MKK"), fill=c("red", "lightblue", "brown", "yellow", "green", "purple", "orange", "grey50", "black", "darkolivegreen", "magenta", "darkblue")) # Add labels for individuals from sample (this requires calibrate) # Can be useful for identifying outliers textxy(mds.cluster.sample$C2, mds.cluster.sample$C1,mds.cluster.sample$IID)