# Look at the genotyped data to be cleaned getwd() # Check the heterozygosity of chr X sex <- read.table("sex.sexcheck", header=T) hist(sex$F, xlab="Probablity chrX alleles are from a single ancestor", main = "chrX Heterozygosity") boxplot(split(sex$F, sex$PEDSEX)) # Heterozygosity rate compared to individual missingness (people missing too many SNPs) # Unusual heterozygosity rates can be observed when there was insufficient genotyping on an individual # Removing those without sufficient genotyping has different potential implications than removing becuase of heterozygosity rates # More observed homozygosity than expected or a low heterozygosity rate might be interpreted as inbreeding. # The .05 for missingness and 3SDs for het rate used here are arbitrary, but fairly standard in the literature het <- read.table("het.imiss.txt", head=TRUE) names(het) hist(het$prop_HET, xlab="Heterozygosity rate", ylab="Frequency", main="") # Sample call rate / individuals missing genotyped data hist(het$F_MISS,xlab="Proportion of individuals missing SNPs", main ='Sample Call Rate') # Check how many individuals have a call rate less tha .95 (or 5% missing genotype data) length(which(het$F_MISS>.05)) # Plot heterozygosity by individual missingness plot(het$F_MISS,het$prop_HET,xlab="Individual missingness", ylab="Heterozygosity rate") abline(v=.05,col="red") mean(het$prop_HET) sd(het$prop_HET) mean(het$prop_HET)-3*sd(het$prop_HET) mean(het$prop_HET)+3*sd(het$prop_HET) abline(h=c(mean(het$prop_HET)-3*sd(het$prop_HET),mean(het$prop_HET)+3*sd(het$prop_HET)),col="blue") # Genotyping Call Rate / proportion of data missing for each SNP miss<-read.table(file="miss.lmiss", header=TRUE) hist(miss$F_MISS,xlab="Proportion of SNPs missing", main="Genotyping Call Rate") # MAF maf <- read.table("maf.frq", header =TRUE) hist(maf$MAF,xlab = "MAF", main ="", breaks = seq(0,.5,.01)) # Heterozygosity after genotype cleaning is completed het2 <- read.table("het2.imiss.txt", head=TRUE) hist(het2$prop_HET, xlab="Heterozygosity rate", ylab="Frequency", main="") hist(het2$F_MISS,xlab="Proportion of the sample missing SNPs", main ='Individual Missingness') plot(het2$F_MISS,het2$prop_HET,xlab="Individual Missingness", ylab="Heterozygosity Rate") mean(het2$prop_HET) sd(het2$prop_HET) mean(het2$prop_HET)-3*sd(het2$prop_HET) mean(het2$prop_HET)+3*sd(het2$prop_HET) abline(h=c(mean(het2$prop_HET)-3*sd(het2$prop_HET),mean(het2$prop_HET)+3*sd(het2$prop_HET)),col="blue")