# Visualise data in the Quality Control practical.

# Check your working directory
getwd()

# Set your working directory to be the folder for this practical
setwd('/home/katrina/practice')

# The step corresponds to those in the tutorial instructions

#### Step 2.4 ####
# Check the heterozygosity of chromosome X

sex <- read.table("sex.sexcheck", header=T)
head(sex)
hist(sex$F, xlab="Probablity chrX alleles are from a single ancestor", 
     main = "chrX Heterozygosity")
boxplot(split(sex$F, sex$PEDSEX))


#### Step 3.2 ####
# Sample Call Rate: individuals missing genotyped data

missI<-read.table(file="miss.imiss", header=TRUE)
head(missI)

max(missI$F_MISS)
h1 = hist(missI$F_MISS, breaks=seq(0,.1,.005))
c1 = ifelse(h1$breaks<0.05, "grey", "red")[-length(h1$breaks)]
plot(h1, col=c1, xlab="Proportion of missing data: individuals", main ='Sample Call Rate')

# Check how many individuals have a call rate less than .95 (or 5% missing genetic data)
length(which(missI$F_MISS>.05))


#### Step 4.1.1 ####
# Genotype Call Rate: SNPs missing information from too many people

missL<-read.table(file="miss.lmiss", header=TRUE)
head(missL)

max(missL$F_MISS)
h2 = hist(missL$F_MISS, breaks=seq(0,.4,.005))
c2 = ifelse(h2$breaks<0.05, "grey", "red")[-length(h2$breaks)]
plot(h2, col=c2, xlab="Proportion of missing data: SNPs", main ='Genotype Call Rate')

# How many SNPs are missing more than 5% of data
length(which(missL$F_MISS>.05))

# BUT since we will impute data, we will be able to re-acquire this information
# And out imputation will be better quality if we don't use poor quality data for the imputation
# We want to maximize our sample size at this point
# So we will clean out the poor quality SNPs first, and then the individuals later


#### 4.2.2 ####
# Minor Allele Frequency

maf <- read.table("maf.frq", header =TRUE)
head(maf)

h3 = hist(maf$MAF, breaks=seq(0,.5,.005))
c3 = ifelse(h3$breaks>0.01, "grey", "red")[-length(h3$breaks)]
plot(h3, col=c3, xlab="Minor allele frequency", main ='MAF')

# Number of variants with a MAF < 0.01
length(which(maf$MAF<0.01))

#### 5.1.2 ####
# Check the sample call rate after cleaning variants

miss2I<-read.table(file="miss2.imiss", header=TRUE)
head(miss2I)

# Check how many individuals have a call rate less than .95 (or 5% missing genetic data)
length(which(miss2I$F_MISS>.05))

h4 = hist(miss2I$F_MISS, breaks=seq(0,.1,.005))
c4=ifelse(h4$breaks<0.05, "grey", "red")[-length(h4$breaks)]
plot(h4, col=c4, xlab="Proportion of missing data: individuals", main ='AFTER')

# Compare with the plot from before cleaning the variants
plot(h1, col=c1, xlab="Proportion of missing data: individuals", main ='BEFORE')


#### 9.8 ####
# Plot the MDS
library(calibrate)

mds.cluster = read.table("cc.mds.mds", header=T, stringsAsFactors = F);

# The script assumes that the ancestry labels in the FID column are always the same.
# ASW   CEU   CHB   CHD   GIH   JPT   LWK  MEX   MKK   TSI   YRI
# Since our FID is all the same for our sample we can check th
table(mds.cluster$FID)

# Assign colours
mds.cluster$color<-"red"
mds.cluster$color[which(mds.cluster$FID=="ASW")]<-"darkolivegreen"
mds.cluster$color[which(mds.cluster$FID=="CEU")]<-"lightblue"
mds.cluster$color[which(mds.cluster$FID=="CHB")]<-"brown"
mds.cluster$color[which(mds.cluster$FID=="CHD")]<-"orange"
mds.cluster$color[which(mds.cluster$FID=="GIH")]<-"black"
mds.cluster$color[which(mds.cluster$FID=="JPT")]<-"purple"
mds.cluster$color[which(mds.cluster$FID=="LWK")]<-"magenta"
mds.cluster$color[which(mds.cluster$FID=="MEX")]<-"grey50"
mds.cluster$color[which(mds.cluster$FID=="MKK")]<-"darkblue"
mds.cluster$color[which(mds.cluster$FID=="TSI")]<-"green"
mds.cluster$color[which(mds.cluster$FID=="YRI")]<-"yellow"

# Different shapes for the points
mds.cluster$pchtype<-17
mds.cluster$pchtype[which(mds.cluster$color=="red")]<-19

mds.cluster.sample<-mds.cluster[which(mds.cluster$color=="red"),]

# Plot points
plot(mds.cluster$C2, mds.cluster$C1, col=mds.cluster$color, ylab="Dimension 1", xlab="Dimension 2",pch=mds.cluster$pchtype)

# Add legend
legend("bottomright", c("My Sample", "CEU", "CHB", "YRI", "TSI", "JPT", "CHD", "MEX", "GIH", "ASW","LWK", "MKK"), fill=c("red", "lightblue", "brown", "yellow", "green", "purple", "orange", "grey50", "black", "darkolivegreen", "magenta", "darkblue"))

# Add labels for individuals from sample (this requires calibrate)
# Can be useful for identifying outliers
textxy(mds.cluster.sample$C2, mds.cluster.sample$C1,mds.cluster.sample$IID)