R version 3.1.2 (2014-10-31) -- "Pumpkin Helmet" Copyright (C) 2014 The R Foundation for Statistical Computing Platform: x86_64-pc-linux-gnu (64-bit) R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > PCs <- read.delim(file="dutch_1kG.R.evec",skip=1,head=FALSE,sep="") > > ## 1. plot ## > > # choose colors for plot > cls=as.character(PCs[,13]) > cls[cls=="3"]="#FFC125" # Dutch individuals > cls[cls=="4"]="#98F5FF" # CEPH individuals > cls[cls=="5"]="#6495ED" # British individuals from England and Scotland > cls[cls=="6"]="#B9D3EE" # HapMap Finnish individuals from Finland > cls[cls=="7"]="#EEEE00" # Iberian populations in Spain > cls[cls=="8"]="#8B7500" # Toscan individuals > cls[cls=="9"]="#CD3333" # Han Chinese in Beijing > cls[cls=="10"]="#8B2323" # Han Chinese South > cls[cls=="11"]="#FF7F50" # Japanese individuals > cls[cls=="12"]="#6B8E23" # Luhya individuals > cls[cls=="13"]="#BCEE68" # Yoruba individuals > > # plot PC1 & PC2 > pdf(file="1000 Genomes_PC1_PC2.pdf",width=9.5,height=6) > par(xpd=T, mar=par()$mar+c(0,0,0,20)) > plot(PCs[,4],PCs[,3],col=cls,type="p",pch=20,bty="l",xlab="PC 2 (from 1000 Genomes)",ylab="PC 1 (from 1000 Genomes)") > legend(.06,.03,bty="n",legend=c( + "Dutch individuals", + "CEPH individuals", + "British individuals from England and Scotland", + "HapMap Finnish individuals from Finland", + "Iberian populations in Spain", + "Toscan individuals", + "Han Chinese in Beijing", + "Han Chinese South", + "Japanese individuals", + "Yoruba individuals", + "Luhya individuals"), + col=c( + "#FFC125", + "#98F5FF", + "#6495ED", + "#B9D3EE", + "#EEEE00", + "#8B7500", + "#CD3333", + "#8B2323", + "#FF7F50", + "#BCEE68", + "#6B8E23"), + lty = c(-1), pch = c(20),lwd=c(-1)) > par(mar=c(5, 4, 4, 2) + 0.1) > dev.off() null device 1 > > ## 2. filter out Dutch individuals scoring higher or lower than maximum or minimum European values respectively ## > > # first, we export all Europen individuals from 1000 Genomes into a matrix called EUR: > EUR <- PCs[which(PCs$V13 < 9 & PCs$V13 > 3),] > > # then we make variables with the highest and lowest values of European individuals (for PC1 and PC2): > PC1max <- max(EUR$V3) > PC1min <- min(EUR$V3) > PC2max <- max(EUR$V4) > PC2min <- min(EUR$V4) > > # than we make a matrix called outliers with all Dutch individuals scoring higher or lower than those European maximum and minimum values: > outliers <- PCs[which((PCs$V13 == 3) & (PCs$V3 > PC1max | PCs$V3 < PC1min | PCs$V4 > PC2max | PCs$V4 < PC2min)),] > > # and finally we export the IDs of those Dutch individuals with non-European ancestry to a file outliers.txt > write.table(outliers[,1:2],file="outliers.txt",quote=FALSE,sep="\t",row.names=F,col.names=FALSE) > > proc.time() user system elapsed 0.300 0.152 0.293