homework_6_ancestry
Last modified: 2015/11/16 10:18 by scott


==== Notes ====


==== Code for Homework ====

  
# PSYC 7102 -- Statistical Genetics
# Homework #6: Ancestry
# Due: November 25th

# PLEASE CONTACT DAVID TO GET YOUR NEW VCF.

# Overview: Using the variant calls in your VCF files, combined with
# the existing variant calls from the 1000 Genomes Project, you will
# conduct a principal components analysis of the 1000 Genomes samples
# using PLINK and estimate your position in the 1000G PCA space.

# This homework is a little different because the commands are
# somewhat complicated, so I've written them all for you. In addition
# to the specific conceptual questions, I ask you to run the commands
# below (changing directories as needed) and then explain what each
# command did and why we did it. I don't need super technical details,
# just a solid conceptual overview of the purpose of each command.

### ONLY EXPLAIN COMMANDS WHERE I SPECIFICALLY REQUEST IT! YOU DO NOT
### HAVE TO EXPLAIN EVERY COMMAND! But please run all the commands so
### that in the end you produce a PCA plot comparing yourself to all
### 1000 Genomes samples.

# For many questions you'll want to run analyses by chromosome. To do
# this, please create an interactive PBS session like this:

qsub -I -l walltime=23:00:00 -q short -l mem=10gb -l nodes=1:ppn=22
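Before launching the backgrounded loops below, it can be worth confirming that the interactive job actually landed on a compute node. A minimal check (the exact hostname and core count are cluster-specific, so treat the comments as illustrative):

```shell
# Confirm the interactive session is on a compute node rather than the
# login node (node names and core counts depend on your cluster).
hostname   # should print a compute-node name
nproc      # cores available to you; we requested ppn=22
```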

### Load apigenome, plink, and tabix
module load apigenome_0.0.2
module load plink_latest
module load tabix_0.2.6

###############
### STEP 1  ###
###############
# Here I take a minute to describe the various files you'll use to get started.

#    The 1000 Genomes vcfs from the 1000 Genomes Consortium (one file per
#    chromosome; chr[1-22] below stands for chr1 through chr22):
/Users/scvr9332/1000Gp3/variant_calls/ALL.chr[1-22].phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz

#    A vcf file with, depending on your level of comfort, your own
#    genotypes or 1000 Genomes sample HG00096. All examples that follow
#    use HG00096 for convenience; it can be obtained as follows. To use
#    your own sample, just replace the $10 with $NF, assuming your
#    sample is in the last column of the vcf.

   (zcat chr1.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'
    for i in {2..22}; do
        zgrep -v '#' chr$i.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'
    done) | bgzip -c > chrALL.filtered.PASS.beagled.HG00096.vcf.gz
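As a toy illustration of the $10 vs. $NF distinction mentioned above, here is a single hypothetical VCF-like record (whitespace-separated for brevity; real VCF columns are tab-separated):

```shell
# In a VCF data line, field 10 is the first sample's genotype column
# and $NF is the last field, i.e. the final sample's column.
line='1 100 rs1 A G 50 PASS . GT 0|1 1|1'
echo "$line" | awk '{print $10}'   # first sample's genotype: 0|1
echo "$line" | awk '{print $NF}'   # last sample's genotype:  1|1
```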

### Add in rsIDs from dbSNP. PLINK needs these to reconcile
### positions/alleles.
vcf-add-rsid -vcf chrALL.filtered.PASS.beagled.HG00096.vcf.gz -db /Users/scvr9332/reference_data/dbsnp_144/dbsnp.144.b37.vcf.gz | bgzip -c > chrALL.filtered.PASS.beagled.HG00096.rsID.vcf.gz
### The previous command keeps only variants with rsIDs; otherwise
### plink throws an error that there are >1 variants with ID = ".".
###
###------ QUESTION 1: WHAT DOES THIS COMMAND DO? (1 point)
zgrep -E '#|\srs[0-9]' chrALL.filtered.PASS.beagled.HG00096.rsID.vcf.gz | bgzip -c > chrALL.filtered.PASS.beagled.HG00096.rsIDsOnly.vcf.gz

##############
### STEP 2 ###
##############
### A whole bunch of data cleaning. Retain in the 1000 Genomes
###   Consortium VCFs only the biallelic SNPs that also exist in your
###   vcf files. PLINK 1.9 can't handle all variant types gracefully.

#  Get a list of your SNPs
zcat chrALL.filtered.PASS.beagled.HG00096.rsIDsOnly.vcf.gz | cut -f3 | grep '^rs' > MyrsIDs.txt
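To see what this extraction produces, here is the same `cut`/`grep` pipeline run on two hypothetical VCF data lines, only one of which carries an rsID:

```shell
# Column 3 of a VCF is the variant ID; grep '^rs' keeps rsIDs and
# drops sites whose ID is '.' (toy tab-separated input).
printf '1\t100\trs123\tA\tG\n1\t200\t.\tT\tC\n' | cut -f3 | grep '^rs'
# prints: rs123
```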

#  Reduce the 1000 Genomes VCFs to bi-allelic SNPs (that's all PLINK
#  can handle anyway). BE SURE TO LOG INTO A COMPUTE NODE BEFORE
#  RUNNING ANY OF THESE LOOPS!!!!!
###
###------ QUESTION 2: WHAT DOES THIS COMMAND (THE ENTIRE FOR LOOP) DO? (2 points)
for i in {1..22}; do
    zgrep -E '^#|\s[ACTG]\s[ACTG]\s' /Users/scvr9332/1000Gp3/variant_calls/ALL.chr$i.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz | bgzip -c > chr$i.1000G.biallelic.vcf.gz &
done

### Retain in the 1000 Genomes VCF only those of your SNPs that are also
### fairly common, because we're going to conduct PCA on these SNPs and
### only want common ones.
###
###------ QUESTION 3: WHY WOULD WE RETAIN ONLY COMMON SNPS, OTHER THAN THAT IT
###------             MAKES EVERY LATER COMMAND FASTER? (2 points)
for i in {1..22}; do
    plink --vcf chr$i.1000G.biallelic.vcf.gz \
        --extract MyrsIDs.txt \
        --maf .05 \
        --make-bed \
        --out chr$i.1000G.biallelic.common.plink &
done
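For intuition about the quantity that `--maf .05` thresholds on, here is minor allele frequency worked out by hand for a hypothetical site with four phased diploid genotypes (the `|` separator matches these beagled VCFs; the genotypes are made up for illustration):

```shell
# Count ALT alleles across genotypes, divide by the 2N chromosomes,
# then fold frequencies above 0.5 to get the minor allele frequency.
echo '0|0 0|1 1|1 0|0' | awk '{
    alt = 0
    for (i = 1; i <= NF; i++) { split($i, g, "\\|"); alt += g[1] + g[2] }
    f = alt / (2 * NF)               # ALT allele frequency: 3/8
    print (f < 0.5 ? f : 1 - f)     # minor allele frequency
}'
# prints: 0.375  -- a site this common easily passes --maf .05
```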

### Remove any remaining duplicates from the 1000G reference files.
### A) Identify duplicate sites
###
###------ QUESTION 4: WALK ME THROUGH WHAT THIS COMMAND (THE ENTIRE
###------             FOR LOOP) IS DOING. (3 points)
for i in {1..22}; do
    zcat chr$i.1000G.biallelic.vcf.gz | cut -f3 | sort | uniq -c | grep -E '\s2\s' | gawk '{print $2}' | grep '^rs' > dups$i.txt &
done

### B) Remove the duplicate sites
for i in {1..22}; do
    plink --bfile chr$i.1000G.biallelic.common.plink \
        --exclude dups$i.txt \
        --make-bed \
        --out chr$i.1000G.biallelic.common.plink.nodups &
done

rm chr*.1000G.biallelic.vcf.gz
rm chr*.1000G.biallelic.common.plink.bed
rm chr*.1000G.biallelic.common.plink.bim
rm chr*.1000G.biallelic.common.plink.fam


## Identify variants in LD
###------ QUESTION 5: EXPLAIN WHAT THE --indep-pairwise ARGUMENT IS DOING (1 point)
for i in {1..22}; do
    plink --bfile chr$i.1000G.biallelic.common.plink.nodups \
        --indep-pairwise 100k 5 .01 \
        --maf .05 \
        --out chr$i.snpsToExtract &
done


## Extract the LD-independent variants
for i in {1..22}; do
    plink --bfile chr$i.1000G.biallelic.common.plink.nodups \
        --extract chr$i.snpsToExtract.prune.in \
        --make-bed \
        --out chr$i.1000G.biallelic.common.plink.nodups.LDpruned &
done

rm chr*.1000G.biallelic.common.plink.nodups.bed
rm chr*.1000G.biallelic.common.plink.nodups.bim
rm chr*.1000G.biallelic.common.plink.nodups.fam
rm dups*


##############
### STEP 3 ###
##############
### Merge the new and improved 1000 Genomes plink files into a single
### file with all chromosomes. This is easier to do when they're vcf
### files.
### Example Command:
for i in {1..22}; do
    plink --bfile chr$i.1000G.biallelic.common.plink.nodups.LDpruned --recode vcf --out tmp$i &
# [...]
  
###----- QUESTION 6: HOW MANY VARIANTS ARE IN THE OUTPUT FILE FROM THE
###-----             COMMAND ABOVE? (1 point)
  
  
# [...]

### average for each site in the score file.
###
###------ QUESTION 7: EXPLAIN WHAT THE FOLLOWING COMMANDS ARE DOING (2 points)
plink --vcf chrALL.filtered.PASS.beagled.HG00096.rsIDsOnly.vcf.gz --score PCA-score-file.txt 2 3 4 header --out PC1
plink --vcf chrALL.filtered.PASS.beagled.HG00096.rsIDsOnly.vcf.gz --score PCA-score-file.txt 2 3 5 header --out PC2
# [...]

R
### Then in this R session run: install.packages('car', repos='http://cran.r-project.org')
### If you log out and back in, you'll have to run this command again to get the car library loaded:
export R_LIBS=/Users/scvr9332/R
  
# [...]

###------ QUESTION 8: PLEASE INTERPRET THE RESULTING GRAPH. WHAT IS
###------             BEING PLOTTED? WHAT DOES IT SAY ABOUT THE
###------             ANCESTRY OF YOUR SAMPLE (OR WHATEVER SAMPLE YOU
###------             USED)? (2 points)
echo "

# [...]

### 'Self-reported' ancestry of 1000g participants
kg_sf <- read.table('/Users/scvr9332/PCA/20130502.sequence.index', header=T, sep='\t', fill=T, stringsAsFactors=F)

sample_ids <- unique(data.frame(IID=kg_sf$SAMPLE_NAME, POPULATION=kg_sf$POPULATION, stringsAsFactors=F))