This practical runs on the workshop RStudio server.
First login to RStudio
https://workshop.colorado.edu/rstudio/
Copy the Practical document into your home directory using the
following R
commands.
setwd("~/.")
system("mkdir -p Day1")
system("cp /home/practicals/1.4.PrincipalComponentAnalysis_LoicYengo/final/Practical_PCA.html ~/Day1/.")
## Now copy the data
system("mkdir -p Day1/pca_prac")
system("cp /home/practicals/1.4.PrincipalComponentAnalysis_LoicYengo/final/data/1kg_* ~/Day1/pca_prac/.")
system("cd Day1/pca_prac")
setwd("Day1/pca_prac")
In this practical we use data from the 1000 Genomes Project (Phase 3). The dataset consists of 2,504 individuals from 26 diverse populations grouped into 5 super-population (AFR: African; AMR: Admixed American; EAS: East-Asian; EUR: European; SAS: South-Asian). More details about the 1000 Genomes data can be found at https://www.internationalgenome.org/. For the practical, we will QC genotypes for 80,244 SNPs.
Check that all the files are copied correctly using
R
system("ls -lt")
or directly via the terminal
ls -lt
## total 52004
## -rw-r----- 1 loic workshop 55156 Feb 22 17:33 1kg_pop2504.txt
## -rw-r--r-- 1 loic workshop 62600 Feb 22 17:33 1kg_hm3_qced.fam
## -rw-r--r-- 1 loic workshop 2233112 Feb 22 17:33 1kg_hm3_qced.bim
## -rw-r--r-- 1 loic workshop 50232747 Feb 22 17:33 1kg_hm3_qced.bed
## -rw-r--r-- 1 loic workshop 9056 Feb 22 17:33 Practical_PCA_solutions.Rmd
## -rw-r--r-- 1 loic workshop 634196 Feb 22 17:32 Practical_PCA.html
## -rw-r--r-- 1 loic workshop 8973 Feb 22 17:32 Practical_PCA.Rmd
We now read the labels for each individual in the data.
labels <- read.table("1kg_pop2504.txt",h=T)
head(labels)
## sample pop super_pop gender
## 1 HG00096 GBR EUR male
## 2 HG00097 GBR EUR female
## 3 HG00099 GBR EUR female
## 4 HG00100 GBR EUR female
## 5 HG00101 GBR EUR male
## 6 HG00102 GBR EUR female
Run the following PLINK command (called from R
, i.e.,
ghe actual command is everything between
system(\" PLINK COMMAND \")
) to calculate 10 PCs from the
set of genotypes stored in the 1kg_hm3_qced.*
files.
system("plink --bfile 1kg_hm3_qced --pca 10 --out myFirstPCA")
Now let’s read and plot the results in R
## Read the results
pca_res <- read.table("myFirstPCA.eigenvec")
colnames(pca_res) <- c("FID","IID",paste0("PC",1:10))
## Read eigen values
eigenvals <- read.table("myFirstPCA.eigenval")[,1]
## Check the results
head(pca_res)
## FID IID PC1 PC2 PC3 PC4 PC5
## 1 HG00096 HG00096 -0.01068150 0.0274031 0.0112437 0.0185781 0.003008380
## 2 HG00097 HG00097 -0.01020950 0.0272527 0.0101893 0.0182465 0.000564962
## 3 HG00099 HG00099 -0.01064670 0.0267610 0.0104280 0.0186655 0.003793260
## 4 HG00100 HG00100 -0.00976738 0.0274966 0.0104824 0.0182644 0.003200500
## 5 HG00101 HG00101 -0.00997834 0.0267219 0.0114768 0.0175850 -0.000158383
## 6 HG00102 HG00102 -0.01038660 0.0269949 0.0108896 0.0180554 0.004403280
## PC6 PC7 PC8 PC9 PC10
## 1 -0.01018480 -5.10416e-05 -0.006787770 0.005072490 0.003387190
## 2 -0.00791965 2.66261e-03 -0.003773080 -0.000732431 0.017153400
## 3 -0.01705030 -2.77909e-03 -0.006008120 -0.007061080 -0.005176360
## 4 -0.00921438 4.83589e-03 -0.003273350 0.002378980 -0.008730690
## 5 -0.01454070 -1.43326e-03 0.000818631 0.001848400 -0.003016550
## 6 -0.01550600 -2.69826e-03 0.003362760 0.001638180 -0.000102151
npc <- 10
for(k in 1:npc){
pca_res[,paste0("PC",k)] <- pca_res[,paste0("PC",k)] * sqrt(eigenvals[k])
}
## Add labels
pca_res <- merge(pca_res,labels,by.x="IID",by.y="sample")
plot(pca_res[,"PC1"],pca_res[,"PC2"],pch=19,axes=FALSE,xlab="PC1",ylab="PC2",col=1,main="")
axis(1);axis(2)
abline(h=0,v=0,col="grey",lty=2)
Question 1a. How many (ancestry) clusters do you see?
Question 1b. What is the ancestry of someone with a PC1 value of 0? Same question for a PC2 value 0.
Let’s use k-means approach to determine the clusters
ncluster <- 5
cluster1_2 <- kmeans(pca_res[,c("PC1","PC2")],ncluster,nstart = 10)$cluster
plot(pca_res[,"PC1"],pca_res[,"PC2"],pch=19,axes=FALSE,xlab="PC1",ylab="PC2",col=cluster1_2,main="")
axis(1);axis(2)
abline(h=0,v=0,col="grey",lty=2)
Now let’s use the geographic labels…
Colors <- c("dodgerblue","coral1","goldenrod","seagreen4","purple3")
Pops <- names(table(labels$super_pop))
names(Colors) <- Pops
plot(pca_res[,"PC1"],pca_res[,"PC2"],pch=19,axes=FALSE,xlab="PC1",ylab="PC2",col=Colors[pca_res[,"super_pop"]],main="")
axis(1);axis(2)
abline(h=0,v=0,col="grey",lty=2)
legend("topright",legend=Pops,fill=Colors,box.lty=0)
Question 2. Are the k-means clusters concordant with geographical labels? If not then how can you improve concordance?
To answer the question you can run the R
code below
## Credit to Museful (stackoverflow)
permutations <- function(n){
if(n==1){
return(matrix(1))
} else {
sp <- permutations(n-1)
p <- nrow(sp)
A <- matrix(nrow=n*p,ncol=n)
for(i in 1:n){
A[(i-1)*p+1:p,] <- cbind(i,sp+(sp>=i))
}
return(A)
}
}
concordance <- function(x,y){
tab <- table(x,y)
n <- nrow(tab)
m <- ncol(tab)
if(n != m){
cat("Error: Number of clusters must be the same for x and y.")
return(NULL)
}
perms <- permutations(n)
conc <- sapply(1:nrow(perms),function(i) sum( diag(tab[perms[i,],]) ) ) / sum(tab)
return(max(conc))
}
concordance(x=cluster1_2,y=pca_res[,"super_pop"])
## [1] 0.9021565
Let’s now visualize for PCs…
for(j in 1:4){
for(k in (j+1):5){
plot(pca_res[,paste0("PC",j)],pca_res[,paste0("PC",k)],pch=19,axes=FALSE,
xlab=paste0("PC",j),ylab=paste0("PC",k),col=Colors[pca_res[,"super_pop"]],
main=paste0("PC",j," vs PC",k))
axis(1);axis(2)
abline(h=0,v=0,col="grey",lty=2)
}
}
So let’s redefine k-mean clusters with more PCs…
ncluster <- 5
cluster1_5 <- kmeans(pca_res[,paste0("PC",1:5)],ncluster,nstart = 10)$cluster
plot(pca_res[,"PC1"],pca_res[,"PC2"],pch=19,axes=FALSE,xlab="PC1",ylab="PC2",col=cluster1_5,main="")
axis(1);axis(2)
abline(h=0,v=0,col="grey",lty=2)
## Test concordance again
concordance(x=cluster1_5,y=pca_res[,"super_pop"])
## [1] 0.9185304
Question 3. Can we improve concordance between PC-based clusters and geographical labels when using more than 5 PCs?
Question 4. What can you conclude overall?
In this part of the practical, we will calculate PCs without AMR
samples, then project back to the PCA map. We will use the software tool
GCTA
. This procedure has multiple steps. First, step run
PCA in without AMR samples.
We first create a list of ID for AMR samples
## Create a list AMR sample
write.table(pca_res[which(pca_res[,"super_pop"]=="AMR"),c("FID","IID")],"AMR.id",quote=F,col.names=F,row.names=F)
## Check the file
system("head AMR.id")
Step 1. Run PCA without AMR samples…
system("gcta64 --bfile 1kg_hm3_qced --remove AMR.id --make-grm --out myGRM_noAMR")
system("gcta64 --grm myGRM_noAMR --pca 2 --out myPCA_noAMR")
Step 2. Calculate SNP loadings…
system("gcta64 --bfile 1kg_hm3_qced --remove AMR.id --pc-loading myPCA_noAMR --out myPCA_noAMR_snp_loading")
Check that the loading file named
myPCA_noAMR_snp_loading.pcl
has been created. You can use
R
system("head myPCA_noAMR_snp_loading.pcl")
or the terminal
head myPCA_noAMR_snp_loading.pcl
## SNP A1 A2 mu pc1_loading pc2_loading
## rs12562034 A G 0.416783 -1.36682e-06 -1.97552e-06
## rs1806509 A C 1.00417 -1.49507e-06 4.65569e-07
## rs9442398 A G 0.566528 1.54655e-06 8.66995e-07
## rs9442373 C A 0.891516 4.60931e-07 -3.46409e-07
## rs4442317 C T 0.566991 3.34991e-07 5.51761e-07
## rs11260542 G A 0.31618 1.25266e-06 -7.04417e-07
## rs6662635 G A 0.115438 1.24524e-06 -3.35032e-07
## rs2477782 T C 0.922114 -1.77782e-06 4.9467e-06
## rs1571150 C A 0.932777 -8.794e-07 1.69551e-06
Step 3. Project AMR samples…
system("gcta64 --bfile 1kg_hm3_qced --keep AMR.id --project-loading myPCA_noAMR_snp_loading 2 --out AMR_PCA")
Check that the loading file named AMR_PCA.proj.eigenvec
has been created. You can use R
system("head AMR_PCA.proj.eigenvec")
or using the terminal
head AMR_PCA.proj.eigenvec
## HG00551 HG00551 -0.00693757 0.0199995
## HG00553 HG00553 -0.00313082 0.0151895
## HG00554 HG00554 -0.00724912 0.0220428
## HG00637 HG00637 -0.0039525 0.0187811
## HG00638 HG00638 -0.00317372 0.0172754
## HG00640 HG00640 -0.00744436 0.0216936
## HG00641 HG00641 -0.00749357 0.0195837
## HG00731 HG00731 -0.00914593 0.0243925
## HG00732 HG00732 -0.00825651 0.0212402
## HG00734 HG00734 -0.00623617 0.0178016
Let’s now load the results in R
and visualize them.
ref <- read.table("myPCA_noAMR.eigenvec",stringsAsFactors=FALSE)
amr <- read.table("AMR_PCA.proj.eigenvec",stringsAsFactors=FALSE)
colnames(ref) <- colnames(amr) <- c("FID","IID","PC1","PC2")
## Read eigenvalues
eigenvals <- read.table("myPCA_noAMR.eigenval")[,1]
npc <- 2
for(k in 1:npc){
ref[,paste0("PC",k)] <- ref[,paste0("PC",k)] * sqrt(eigenvals[k])
amr[,paste0("PC",k)] <- amr[,paste0("PC",k)] * sqrt(eigenvals[k])
}
ref <- merge(ref,labels,by.x="IID",by.y="sample")
plot(ref[,"PC1"],ref[,"PC2"],pch=19,axes=FALSE,xlab="PC1",ylab="PC2",col=Colors[ref[,"super_pop"]],main="")
axis(1);axis(2)
abline(h=0,v=0,col="grey",lty=2)
legend("topright",legend=Pops,fill=Colors,box.lty=0)
## Plot AMR projections
points(amr[,"PC1"],amr[,"PC2"],col=Colors["AMR"],pch=17)
Question 5. Compare PC coordinates of AMR samples when they are included versus excluded from PCA
comparison <- merge(pca_res[,c("IID","PC1","PC2")],amr[,c("IID","PC1","PC2")],by="IID")
apply(comparison[,-1],2,mean)
## PC1.x PC2.x PC1.y PC2.y
## -0.10549471 0.08336128 -0.10072258 0.09287991
apply(comparison[,-1],2,sd)
## PC1.x PC2.x PC1.y PC2.y
## 0.06173315 0.09644813 0.05517969 0.09548338
cor(comparison[,"PC1.x"],comparison[,"PC1.y"])
## [1] 0.996955
cor(comparison[,"PC2.x"],comparison[,"PC2.y"])
## [1] 0.9998951
Question 6. Assign an ancestry cluster to the AMR samples
clus_centers <- aggregate(cbind(PC1,PC2)~super_pop,data=ref,FUN=mean)
## Distance calculate distance to each cluster
Distances <- matrix(NA,nrow=nrow(amr),ncol=4)
for(i in 1:nrow(amr)){
for(k in 1:4){
Distances[i,k] <- sum( ( as.numeric(amr[i,c("PC1","PC2")]) - as.numeric(clus_centers[k,-1]) )^2 )
}
}
clus_amr <- clus_centers[apply(Distances,1,which.min),1]
table(clus_amr)
## clus_amr
## AFR EAS EUR SAS
## 3 2 76 266