########################################### # Data ########################################### ## VCF exome data from 1000 Genomes # GBR British in England and Scotland # TSI Toscani in Italia # LWK Luhya in Webuye, Kenya # ASW Americans of African Ancestry in SW USA ########################################### # Section 1: Setting up and viewing data ########################################### ## make PSEQpractical directory mkdir PSEQpractical ## change directory cd PSEQpractical ## copy directory contents cp /faculty/douglas/PSEQpractical/* ./ ## symbolic link to resources ln -s /opt/pseq/resources resources ## symbolic link to vcf ln -s /opt/pseq/vcf vcf ## Make sure pseq runs pseq ## test help menus pseq help pseq help views ## view VCF (type q to exit less) zless -S vcf/GBR.vcf.gz pseq vcf/GBR.vcf.gz meta-summary | less -S ## view VCF using pseq pseq vcf/GBR.vcf.gz v-view | less -S ## view resource databases ## LOCDB: RefSeq gene transcripts (e.g. from UCSC, or HGNC website) pseq . loc-summary --locdb resources/locdb ## SEQDB: this has been pre-made (from a FASTA downloaded from UCSC) pseq . seq-summary --seqdb resources/seqdb ## to view a stretch of reference sequence: pseq . seq-view --seqdb resources/seqdb --region chr22:24000000..24000010 ## create pseq project pseq lwk new-project --resources resources/ ## load VCF (will take about a minute to load) pseq lwk load-vcf --vcf vcf/LWK.vcf.gz ## look at pseq file using less less -S lwk.pseq ## viewing variant data (v-view, rv-view, mrv-view) # view all variants pseq lwk v-view | less -S ## view all non-reference variants with genotypes pseq lwk rv-view | less - ## add varianat level meta information pseq lwk rv-view --vmeta | less pseq lwk rv-view --vmeta --show DP | less ## add genotype level meta information pseq lwk rv-view --gmeta | less pseq lwk rv-view --gmeta --show GQ | less ## individual summary pseq lwk i-stats ## variant summary pseq lwk v-stats ########################################### # Section 2: Filtering and QC-ing data ########################################### ## Masks ## Get summary statistics for genomic regions # i-stats pseq lwk i-stats --mask reg=chr22 ## v-stats pseq lwk v-stats --mask reg=chr22:10000000-20000000 pseq lwk v-stats --mask reg=chr22 --stats gmean=GQ pseq lwk v-stats --mask reg=chr22 --stats count=DP:0-100,DP:200-1000,DP:1000-5000,DP:5000- mac=1,2-4,5-10,11- ## filtering in individuals pseq lwk mrv-view --mask indiv=NA19350 reg=chr22:10000000-20000000 ## filter out individuals pseq lwk v-stats --mask reg=chr17 indiv.ex=NA19350 ## filter out file of individuals pseq lwk i-view | grep -v \# | awk '{print $1}' | head -n 5 > indiv.remove pseq lwk v-stats --mask reg=chr17 indiv.ex=@indiv.remove ## filtering out any failing variants pseq lwk v-stats --mask reg=chr22 any.filter.ex ##show variant level meta information per variant in matrix form pseq lwk meta-matrix --show DP MQ | less ## filtering genotypes pseq lwk v-stats --mask reg=chr22 geno=GQ:ge:30 ## make a new VCF of all variants with at least one alternate allele assuming missing calls are reference (filtered, here only chr22) pseq lwk write-vcf --mask reg=chr22 assume-ref mac=1- > c22.vcf pseq c22 new-project --resources resources/ pseq c22 load-vcf --vcf c22.vcf ## gene-based summaries pseq c22 g-view --mask loc.group=refseq | less pseq c22 g-view --geno --mask gene=RAC2 # gene-based summaries # note, this will include intronic variants also # note: 'empty.group' mask, to force output for genes w/ no variants pseq lwk g-stats --mask loc.group=genes empty.group --locdb resources/locdb.genes | less ## Ancestry inference and relatedness less resources/panel.pfrq pseq lwk i-pop --file resources/panel.pfrq --out lwk ## Look at pairwise sharing (relatedness/contamination) pseq lwk ibs-matrix --mask mac=2-5 --long-format --two-counts ## variants unique to (or enriched in) a particular group # for the two individuals showing excess sharing (from ibs-matrix command, above) pseq lwk unique --indiv NA19334 NA19313 ########################################### # Section 3: Annotation ########################################### ## annotate all variants using refseq pseq c22 lookup --annotate refseq --mask no-geno no-vmeta > annot.txt ## let's look at the annotation output less annot.txt grep chr22:17469049 annot.txt grep chr22:39498038 annot.txt ## create file with nonsysonymous variants from the "worst" annotation category awk ' $2 == "worst" { print $1,$3 } ' annot.txt > worst.txt awk '{print $2}' worst.txt | sort | uniq -c awk ' $2 == "missense" || $2 == "nonsense" { print $1 } ' worst.txt > ns.txt awk ' $2 == "intronic" { print $1 } ' worst.txt > intronic.txt # filter only on particular annotations pseq c22 v-stats --mask ereg=@ns.txt ############################################ # Section 4: Variant/Gene based association ############################################ ## create chromosome 22 VCF files from two different datasets pseq vcf/GBR.vcf.gz write-vcf --mask reg=chr22 assume-ref mac=1- > gbr-c22.vcf pseq vcf/TSI.vcf.gz write-vcf --mask reg=chr22 assume-ref mac=1- > tsi-c22.vcf ## Create a combined pseq project pseq eur new-project --resources resources/ pseq eur load-vcf --vcf gbr-c22.vcf --vcf tsi-c22.vcf ## Look at dummy phenotype file (GBR as cases, TSI as controls) less pop.phe awk '{print $2}' pop.phe | sort | uniq -c ## load phenotype file pseq eur load-pheno --file pop.phe ## view individuals to see phenotypes have been loaded pseq eur i-view | head ## get allele counts per variant by phenotype pseq eur counts --pheno phe --mask limit=10 ## single-site associaton (fisher's exact test) pseq eur v-assoc --pheno phe > assoc.txt ## View single variant association less -S assoc.txt ## gene-based association pseq eur assoc --pheno phe --mask loc.group=refseq --out genebased awk ' $6 < 0.01 ' genebased.assoc grep -w NM_031890 genebased.assoc.det pseq eur rv-view --mask reg=chr22:17600497 --pheno phe --gmeta ## annotate eur chr22 pseq eur lookup --annotate refseq --mask no-geno no-vmeta > annot2.txt ## pull out nonsynonymous variants awk '$2 == "worst" && ($3 ~ "nonsense" || $3 ~ "missense") {print $1}' annot2.txt > ns2.txt ## burden test, using permutation, restricted to NS variants pseq eur assoc --pheno phe --mask loc.group=genes --locdb resources/locdb.genes --mask ereg.req=@ns2.txt --tests burden vt --out genebased2 awk ' $6 < 0.01 ' genebased2.assoc grep SHANK3 genebased2.assoc.det pseq eur rv-view --gmeta --mask ereg.req=@ns2.txt gene=SHANK3 ############################################ # Section 5: Gene set tests ############################################ # BURDEN tests; condition just on 'singletons' # this generates 1000 null datasets by permutation and stores all "BURDEN" # statistics (of a greater burden of rare mutations in GBR vs TSI, in this case) pseq eur assoc --phenotype phe --tests burden --perm 1000 --dump-null-matrix \ --mask loc.group=genes assume-ref mac=1-1 ereg.req=@ns.txt --locdb resources/locdb.genes --out burden-ns less -S burden.matrix # necessary auxiliary files for the set-test less gene.list less equiv.txt ## generic 'gene-set' less gene-families.set ## make alias alias smp="./smp" ## run the set-based tests: smp -m gene.list equiv.txt gene-families.set burden.matrix > set-gene-families.txt # any 'enriched' sets awk ' $1 == "SET" && $4 < 0.01 ' set-gene-families.txt ################################################# # Additional topics (Just examples, not tested!) ################################################# ##Creating varsets ## annotate all variants to be gene/allele specific (will take a long time to do whole genome) pseq eur lookup --annotate refseq --mask no-geno no-vmeta --worstByAlias symbol > annot.txt * create varsets awk ' $2 == "worst_by_alias_symbol_alt" { printf $1"\t"$3 } \ $2 == "worst_by_alias_symbol" { printf "\t"$3 } \ $2 == "alias_symbol" { printf "\t"$3"\n" } ' annot.txt | \ tr ',' '\t' | tr ' ' '\t' | awk -F"\t" ' { n=(NF-1)/3 ; for(i=0;i annot2genes ** disruptive egrep -w "(nonsense|esplice|frameshift)" annot2genes | tr '|' '\t' | awk ' { print $1,$3"|disruptive",$2,$5 } ' OFS="\t" > disruptive.varset awk ' { print "disruptive" , $2 } ' OFS="\t" disruptive.varset | sort | uniq > disruptive.varsuperset pseq c22 var-set --file ${FILES}/disruptive.varset pseq c22 var-superset --file ${FILES}/disruptive.varsuperset ## checks pseq $PROJ var-summary | less ## just pick first 10 people to remove pseq eur i-view | grep -v \# | awk '{print $1}' | head > indiv.remove ## can now use varsets to filter or perform more advanced operation like genic association pseq eur assoc --phenotype phe \ --mask varset.group=disruptive \ --mask mac=1-5 indiv.ex=@indiv.remove \ --mask any.filter.ex geno=GQ:ge:90,DP:ge:10 \ --out disruptive-mac1-5