Notes

Code for Homework

# PSYC 7102 – Statistical Genetics # Homework #6: Ancestry # Due: November 25th

# PLEASE CONTACT DAVID TO GET YOUR NEW VCF.

# Overview: Using your variant calls contained in your VCF files, # combined with the existing 1000 Genomes variant calls from the 1000 # Genomes project, you will conduct a principal components analysis # of the 1000 Genomes samples using PLINK and estimate your position in # the 1000G PCA space

# This homework is a little bit different because the commands are # slightly complicated, so I've written them all for you. In addition # to specific conceptual questions, I ask you to run the commands # below (changing directories appropriately as needed) and then # explain to me what the command did and why we did it. I don't need # super technical details but simply a solid conceptual overview of # the purpose of the command.

### ONLY EXPLAIN COMMANDS WHERE I SPECIFICALLY REQUEST IT! YOU DO NOT ### HAVE TO EXPLAIN EVERY COMMAND! But please run all commands to produce ### in the end a PCA plot containing yourself compared to all 1000 Genomes ### samples.

# For many questions you'll want to run analyses by chromosome. To do # this, please create an interactive PBS session like this:

qsub -I -l walltime=23:00:00 -q short -l mem=10gb -l nodes=1:ppn=22

### Load apigenome, plink module load apigenome_0.0.2 module load plink_latest module load tabix_0.2.6

############### ### STEP 1 ### ############### # Here I take a minute to describe the various files you'll use to get started

# The 1000 Genomes vcfs from the 1000 Genomes Consortium /Users/scvr9332/1000Gp3/variant_calls/ALL.chr[1-22].phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz

# A vcf file with, depending on your level of comfort, your own # genotypes or 1kg sample HG000096. All examples that follow use # HG00096 for convenience, which can be obtained as follows. To use # your own sample just replace the $10 with $NF, assuming your # sample is in the last column of the vcf.

 (zcat chr1.filtered.PASS.beagled.vcf.gz          | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
 zgrep -v '#' chr2.filtered.PASS.beagled.vcf.gz  | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
 zgrep -v '#' chr3.filtered.PASS.beagled.vcf.gz  | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
 zgrep -v '#' chr4.filtered.PASS.beagled.vcf.gz  | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
 zgrep -v '#' chr5.filtered.PASS.beagled.vcf.gz  | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
 zgrep -v '#' chr6.filtered.PASS.beagled.vcf.gz  | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
 zgrep -v '#' chr7.filtered.PASS.beagled.vcf.gz  | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
 zgrep -v '#' chr8.filtered.PASS.beagled.vcf.gz  | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
 zgrep -v '#' chr9.filtered.PASS.beagled.vcf.gz  | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
 zgrep -v '#' chr10.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
 zgrep -v '#' chr11.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
 zgrep -v '#' chr12.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
 zgrep -v '#' chr13.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
 zgrep -v '#' chr14.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
 zgrep -v '#' chr15.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
 zgrep -v '#' chr16.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
 zgrep -v '#' chr17.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
 zgrep -v '#' chr18.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
 zgrep -v '#' chr19.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
 zgrep -v '#' chr20.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
 zgrep -v '#' chr21.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
 zgrep -v '#' chr22.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}') | bgzip -c > chrALL.filtered.PASS.beagled.HG00096.vcf.gz

### Add in rsIDs from dbSNP. PLINK needs these to reconcile ### positions/alleles vcf-add-rsid -vcf chrALL.filtered.PASS.beagled.HG00096.vcf.gz -db /Users/scvr9332/reference_data/dbsnp_144/dbsnp.144.b37.vcf.gz | bgzip -c > chrALL.filtered.PASS.beagled.HG00096.rsID.vcf.gz ### The previous command keeps only variants with rsIDs, otherwise ### plink throws an error that there are >1 variants with ID = “.” ### ###—— QUESTION 1: WHAT DOES THIS COMMAND DO? (1 point) zgrep -E '#|\srs[0-9]' chrALL.filtered.PASS.beagled.HG00096.rsID.vcf.gz | bgzip -c > chrALL.filtered.PASS.beagled.HG00096.rsIDsOnly.vcf.gz

############## ### STEP 2 ### ############## ### Whole bunch of data cleaning. Retain in the 1000 Genomes ### Consortium VCFs only biallelic SNPs that also exist in your vcf ### files. PLINK 1.9 can't handle all variant types gracefully.

# Get list of your SNPs zcat chrALL.filtered.PASS.beagled.HG00096.rsIDsOnly.vcf.gz | cut -f3 | grep '^rs' > MyrsIDs.txt # # Reduce the 1000 Genomes VCFs to bi-allelic SNPs (that's all PLINK # can handle anyway). BE SURE TO LOG INTO A COMPUTE NODE BEFORE # RUNNING ANY OF THESE LOOPS!!!!! ### ###—— QUESTION 2: WHAT DOES THIS COMMAND (THE ENTIRE FOR LOOP) DO? (2 points) for i in {1..22}; do

  zgrep -E '^#|\s[ACTG]\s[ACTG]\s' /Users/scvr9332/1000Gp3/variant_calls/ALL.chr$i.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz | bgzip -c > chr$i.1000G.biallelic.vcf.gz &

done

### Retain in the 1000 Genomes VCF only your SNPs that are also fairly common ### because we're going to conduct PCA on these SNPs and only want common ones. ### ###—— QUESTION 3: WHY WOULD WE RETAIN ONLY COMMON SNPS, OTHER THAN IT ###—— MAKES EVERY COMMAND LATER FASTER? (2 points) for i in {1..22}; do

  plink --vcf chr$i.1000G.biallelic.vcf.gz \
      --extract MyrsIDs.txt \
      --maf .05 \
      --make-bed \
      --out chr$i.1000G.biallelic.common.plink &

done

### Remove any remaining duplicates from the 1000G reference files ### A) Identify duplicate sites ### ###—— QUESTION 4: WALK ME THROUGH WHAT THIS COMMAND (THE ENTIRE ###—— FOR LOOP) IS DOING. (3 points) for i in {1..22}; do

  zcat chr$i.1000G.biallelic.vcf.gz | cut -f3 | sort | uniq -c | grep -E '\s2\s' | grep '^rs' | gawk '{print $2}' >  dups$i.txt & 

done

### B) Remove the duplicate sites for i in {1..22}; do

  plink --bfile chr$i.1000G.biallelic.common.plink \
      --exclude dups$i.txt \
      --make-bed \
      --out chr$i.1000G.biallelic.common.plink.nodups &

done

rm chr*.1000G.biallelic.vcf.gz rm chr*.1000G.biallelic.common.plink.bed rm chr*.1000G.biallelic.common.plink.bim rm chr*.1000G.biallelic.common.plink.fam

## Identify variants in LD for i in {1..22}; do

  plink --bfile chr$i.1000G.biallelic.common.plink.nodups \
      --indep-pairwise 100k 5 .01 \  #------ QUESTION 5: EXPLAIN WHAT THIS ARGUMENT IS DOING (1 point)
      --maf .05 \
      --out chr$i.snpsToExtract &

done

## Extract LD-independent variants for i in {1..22}; do

  plink --bfile chr$i.1000G.biallelic.common.plink.nodups \
      --extract chr$i.snpsToExtract.prune.in \
      --make-bed \
      --out chr$i.1000G.biallelic.common.plink.nodups.LDpruned &

done

rm chr*.1000G.biallelic.common.plink.nodups.bed rm chr*.1000G.biallelic.common.plink.nodups.bim rm chr*.1000G.biallelic.common.plink.nodups.fam rm dups*

############## ### STEP 3 ### ############## ### Merge the new and improved 1000 Genomes plink files into a single ### file with all chromosomes. This is easier to do when they're vcf ### files ### Example Command: for i in {1..22}; do

  plink --bfile chr$i.1000G.biallelic.common.plink.nodups.LDpruned --recode-vcf --out tmp$i & 

done

(cat tmp1.vcf; grep -v '#' tmp2.vcf; grep -v '#' tmp3.vcf; grep -v '#' tmp4.vcf; grep -v '#' tmp5.vcf; \

  grep -v '#' tmp6.vcf; grep -v '#' tmp7.vcf; grep -v '#' tmp8.vcf; grep -v '#' tmp9.vcf; grep -v '#' tmp10.vcf; \
  grep -v '#' tmp11.vcf; grep -v '#' tmp12.vcf; grep -v '#' tmp13.vcf; grep -v '#' tmp14.vcf; grep -v '#' tmp15.vcf; \
  grep -v '#' tmp16.vcf; grep -v '#' tmp17.vcf; grep -v '#' tmp18.vcf; grep -v '#' tmp19.vcf; grep -v '#' tmp20.vcf; \
  grep -v '#' tmp21.vcf; grep -v '#' tmp22.vcf) | bgzip -c > chrALL.1000G.biallelic.common.plink.nodups.LDpruned.vcf.gz

###—– QUESTION 6: HOW MANY VARIANTS ARE IN THE OUTPUT FILE FROM THE ###—– COMMAND ABOVE? (1 point)

plink –vcf chrALL.1000G.biallelic.common.plink.nodups.LDpruned.vcf.gz \

  1. -make-bed \
  2. -out chrALL.1000G.biallelic.common.plink.nodups.LDpruned

rm tmp* rm chr[0-9]*.bed rm chr[0-9]*.bim rm chr[0-9]*.fam rm chr[0-9]*.nosex rm chr[0-9]*.log rm chr[0-9]*.prune.in rm chr[0-9]*.prune.out

############## ### STEP 4 ### ############## # Conduct a PCA on these new and improved 1000 Genomes plink files. plink –bfile chrALL.1000G.biallelic.common.plink.nodups.LDpruned –pca var-wts

# Use R to organize the results from the PCA into a file that can then be # imported into PLINK using the –score command echo “### Extract per - SNP weights for each of the first 20 PCs and get the ### correct effect allele for later scoring ### PCA ← read.table('plink.eigenvec.var', header = F, col.names = c('CHROM','RS',paste('PC',1:20,sep=))) bim ← read.table('chrALL.1000G.biallelic.common.plink.nodups.LDpruned.bim', header = F, col.names = c('CHROM','RS','unknown','POS','A1','A2')) x ← merge (PCA, bim) write.table(subset(x, select = c('RS','A1', paste('PC', 1:20, sep = ))) , file = 'PCA-score-file.txt', quote=F) ### End: extract per - SNP weights for each of the first 20 ” | R –vanilla

############## ### STEP 5 ### ############## ### Score yourself (or 1000 Genomes sample HG00096) on the PCs

### Calculate the score for each person by taking the PC-weighted ### average for each site in the score file. ### ###—— QUESTION 7: EXPLAIN WHAT THE FOLLOWING COMMANDS ARE DOING (2 points) plink –vcf chrALL.filtered.PASS.beagled.HG00096.rsIDsOnly.vcf.gz –score PCA-score-file.txt 2 3 4 header –out PC1 plink –vcf chrALL.filtered.PASS.beagled.HG00096.rsIDsOnly.vcf.gz –score PCA-score-file.txt 2 3 5 header –out PC2 plink –vcf chrALL.filtered.PASS.beagled.HG00096.rsIDsOnly.vcf.gz –score PCA-score-file.txt 2 3 6 header –out PC3 plink –vcf chrALL.filtered.PASS.beagled.HG00096.rsIDsOnly.vcf.gz –score PCA-score-file.txt 2 3 7 header –out PC4

plink –bfile chrALL.1000G.biallelic.common.plink.nodups.LDpruned –score PCA-score-file.txt 2 3 4 header –out PC1.1kg plink –bfile chrALL.1000G.biallelic.common.plink.nodups.LDpruned –score PCA-score-file.txt 2 3 5 header –out PC2.1kg plink –bfile chrALL.1000G.biallelic.common.plink.nodups.LDpruned –score PCA-score-file.txt 2 3 6 header –out PC3.1kg plink –bfile chrALL.1000G.biallelic.common.plink.nodups.LDpruned –score PCA-score-file.txt 2 3 7 header –out PC4.1kg

############## ### STEP 6 ### ############## ### Plot the results! ### ### To run the following command you'll have to install the R package ### 'car'. To do this on vieques please run the following. ONly need ### to run it once

mkdir /Users/YourUniqueName/R export R_LIBS=/Users/scvr9332/R ## replace my unique name with yours module load R_3.2.2 R ### Then in this R session run: 'install.packages('car', repos='http://cran.r-project.org')' ### If you log out and back in, you'll have to again run this command again to get the car library loaded: export R_LIBS=/Users/scvr9332/R

###—— QUESTION 8: PLEASE INTERPRET THE RESULTING GRAPH. WHAT IS ###—— BEING PLOTTED? WHAT DOES IT SAY ABOUT THE ###—— ANCESTRY OF YOUR SAMPLE (OR WHATEVER SAMPLE YOU ###—— USED) (2 points) echo “

### Read in scores for PC1 and PC2 in 1000g kg_PC1 ← read.table('PC1.1kg.profile', header=T) kg_PC2 ← read.table('PC2.1kg.profile', header=T) kg_PC3 ← read.table('PC3.1kg.profile', header=T) kg_PC4 ← read.table('PC4.1kg.profile', header=T)

### 'Self-reported' ancestry of 1000g participants kg_sf ← read.table('/Users/scvr9332/PCA/20130502.sequence.index', header=T, sep='\t', fill=T, stringsAsFactors=F)

sample_ids ← unique(data.frame(IID=kg_sf$SAMPLE_NAME, POPULATION=kg_sf$POPULATION, stringsAsFactors=F))

sample_ids$CONTINENT ← ifelse(sample_ids$POPULATION=='CHB' |

                             sample_ids$POPULATION=='JPT' |
                             sample_ids$POPULATION=='CHS' |
                             sample_ids$POPULATION=='CDX' |
                             sample_ids$POPULATION=='KHV' |
                             sample_ids$POPULATION=='CHD', 'EAS', sample_ids$POPULATION)

sample_ids$CONTINENT ← ifelse(sample_ids$CONTINENT=='CEU' |

                             sample_ids$CONTINENT=='TSI' |
                             sample_ids$CONTINENT=='GBR' |
                             sample_ids$CONTINENT=='FIN' |
                             sample_ids$CONTINENT=='IBS', 'EUR', sample_ids$CONTINENT)

sample_ids$CONTINENT ← ifelse(sample_ids$CONTINENT=='YRI' |

                             sample_ids$CONTINENT=='LWK' |
                             sample_ids$CONTINENT=='GWD' |
                             sample_ids$CONTINENT=='MSL' |
                             sample_ids$CONTINENT=='ESN', 'AFR', sample_ids$CONTINENT)

sample_ids$CONTINENT ← ifelse(sample_ids$CONTINENT=='ASW' |

                             sample_ids$CONTINENT=='ACB' |
                             sample_ids$CONTINENT=='MXL' |
                             sample_ids$CONTINENT=='PUR' |
                             sample_ids$CONTINENT=='CLM' |
                             sample_ids$CONTINENT=='PEL', 'ADM', sample_ids$CONTINENT)

sample_ids$CONTINENT ← ifelse(sample_ids$CONTINENT=='GIH' |

                             sample_ids$CONTINENT=='PJL' |
                             sample_ids$CONTINENT=='BEB' |
                             sample_ids$CONTINENT=='STU' |
                             sample_ids$CONTINENT=='ITU', 'SAS', sample_ids$CONTINENT)

sample_ids$CONTINENT_numeric ← ifelse(sample_ids$CONTINENT == 'EAS', 1, sample_ids$CONTINENT) sample_ids$CONTINENT_numeric ← ifelse(sample_ids$CONTINENT == 'EUR', 1, sample_ids$CONTINENT_numeric) sample_ids$CONTINENT_numeric ← ifelse(sample_ids$CONTINENT == 'AFR', 1, sample_ids$CONTINENT_numeric) sample_ids$CONTINENT_numeric ← ifelse(sample_ids$CONTINENT == 'ADM', 1, sample_ids$CONTINENT_numeric) sample_ids$CONTINENT_numeric ← ifelse(sample_ids$CONTINENT == 'SAS', 1, sample_ids$CONTINENT_numeric)

kg_PC1 ← merge(kg_PC1, sample_ids)

My_PC1 ← read.table('PC1.profile', header=T) My_PC2 ← read.table('PC2.profile', header=T) My_PC3 ← read.table('PC3.profile', header=T) My_PC4 ← read.table('PC4.profile', header=T)

kg ← data.frame(PC1 = -kg_PC1$SCORE,

               PC2 = -kg_PC2$SCORE,
               PC3 = -kg_PC3$SCORE,
               PC4 = -kg_PC4$SCORE,
               ancestry = kg_PC1$CONTINENT)

all ← rbind(kg, data.frame(PC1=-My_PC1$SCORE,

                          PC2=-My_PC2$SCORE,
                          PC3=-My_PC3$SCORE,
                          PC4=-My_PC4$SCORE,
                          ancestry=rep('Me', nrow(My_PC1))))

### Scatterplot matrix library(car) pdf('ancestry_Scatterplot.pdf') scatterplotMatrix(~PC1+PC2+PC3+PC4 | ancestry, data=all, smoother=FALSE, reg.line=FALSE, cex=.7) dev.off() ” | R –vanilla