#!/usr/bin/env Rscript # ==== QC_steps.R ==== # Visualise and summarize PLINK output in the Quality Control practical. # Check your working directory getwd() # If not there already, set your working directory to be the folder for this practical setwd('~/practicals/1.2.DataGeneration+QC_AleshaHatton/final/QCdata') dir() ## check the contents of the directory ## ==== Practical - R plotting Outline ==== # 2.3 Plotting X chromosome heterozygosity in R # 3.2 Plotting sample call rate in R # 4.1 Plot SNP call rate in R # 4.6 Plot the sample allele frequency spectrum in R # 5.2 Plotting pre and post QC call rates in R ## ==== END Practical - R plotting Outline ==== # 2.3 Plotting X chromosome heterozygosity in R # Check the heterozygosity of chromosome X # read in PLINK sexcheck file sex <- read.table("sex.sexcheck", header=T) # view contents head(sex) # Histogram hist(sex$F, xlab="Probability chrX alleles are from a single ancestor", main = "chrX Heterozygosity") # Boxplot split by reported sex boxplot(split(sex$F, sex$PEDSEX), main = "Checking reported/genotype sex discrepancies", xlab="Reported sex", ylab="chrX Heterozygosity") # 3.2 Plotting sample call rate in R # Sample Call Rate: individuals missing genotyped data # read in PLINK imiss file missI <- read.table(file="miss.imiss", header=TRUE) # view contents head(missI) # Histogram of F_MISS h1 = hist(missI$F_MISS, breaks=seq(0,.1,.005)) c1 = ifelse(h1$breaks<0.05, "grey", "red")[-length(h1$breaks)] plot(h1, col=c1, xlab="Proportion of missing data: individuals", main ='Sample Call Rate') # look at highest missing SNP frequency max(missI$F_MISS) # Look at top 20 samples with the highest missing SNP frequency head(missI[order(missI$F_MISS,decreasing=T),],20) # Check how many individuals have a call rate less than .95 (or 5% missing genetic data) sum(missI$F_MISS > .05) # 4.1 Plot SNP call rate in R # Genotype Call Rate: SNPs missing information from too many people # read in PLINK lmiss file missL<-read.table(file="miss.lmiss", header=TRUE) # view contents head(missL) # Histogram of F_MISS h2 = hist(missL$F_MISS, breaks=seq(0,.4,.005)) c2 = ifelse(h2$breaks<0.05, "grey", "red")[-length(h2$breaks)] plot(h2, col=c2, xlab="Proportion of missing data: SNPs", main ='Genotype Call Rate') # How many SNPs are missing more than 5% of data sum(missL$F_MISS > .05) # NOTE: It is best to be conservative with SNP call rate filtering, # as imputation will provide many more SNPs than those lost here. # Our imputation will be better quality if we restrict to the highest quality SNPs for imputation # We want to maximize our sample size at this point # So we will clean out the poor quality SNPs first, and then individuals later # 4.6 Plot the sample allele frequency spectrum in R # read in PLINK .frq file maf <- read.table("maf.frq", header =TRUE) # view contents head(maf) ## Histogram h3 = hist(maf$MAF, breaks=seq(0,.5,.005)) c3 = ifelse(h3$breaks>0.01, "grey", "red")[-length(h3$breaks)] plot(h3, col=c3, xlab="Minor allele frequency", main ='MAF') # Number of variants with a MAF < 0.01 sum(maf$MAF < 0.01) # 5.2 Plotting pre and post QC call rates in R # read in PLINK .imiss file miss2I<-read.table(file="miss2.imiss", header=TRUE) # view contents head(miss2I) ## Histogram h4 = hist(miss2I$F_MISS, breaks=seq(0,.1,.005)) c4=ifelse(h4$breaks<0.05, "grey", "red")[-length(h4$breaks)] par(mfrow=c(2,1)) ## Plot both graphs plot(h4, col=c4, xlab="Proportion of missing data: individuals", main ='AFTER QC') # Compare with the plot from before cleaning the variants plot(h1, col=c1, xlab="Proportion of missing data: individuals", main ='BEFORE QC') par(mfrow=c(1,1)) ## reset plotting # Check how many individuals have a call rate less than .95 after variant QC (or 5% missing genetic data) sum(miss2I$F_MISS > .05)