############################################################################################## # # # Setting things up # # # ############################################################################################## mkdir pseq-practical cd pseq-practical cp /opt/pseq/data/exomes.tar.gz . tar -xzvf exome.tar.gz ## This is the efficient way to do it at the workshop ln -s /opt/pseq/resources ## Don't run these commands that start with "#" #mkdir resources #cp /opt/pseq/resources/* resources/ ## Move to working directory for this practical ## Check folder contents ls -lR ## Check pseq is installed correctly pseq pseq help pseq help views pseq help v-view ############################################################################################## # # # The exome sequence data (VCFs) # # # ############################################################################################## ## Get VCF exome data from 1000 Genomes FTP # GBR British in England and Scotland # TSI Toscani in Italia # LWK Luhya in Webuye, Kenya # cd vcf # wget -c ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20110810_exome_consensus_snps/GBR.wex.illumina_mosaik_svm_consensus_wcmc.20110521.exome.genotypes.vcf.gz # wget -c ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20110810_exome_consensus_snps/TSI.wex.illumina_mosaik_svm_consensus_wcmc.20110521.exome.genotypes.vcf.gz # wget -c ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20110810_exome_consensus_snps/LWK.wex.illumina_mosaik_svm_consensus_wcmc.20110521.exome.genotypes.vcf.gz # cd .. ## Note: pre-processing step; these are distributed to have exactly the same number of rows (DONE) # for s in GBR TSI LWK ASW # do # zgrep ^# vcf/${s}.wex.illumina_mosaik_svm_consensus_wcmc.20110521.exome.genotypes.vcf.gz > vcf/${s}.vcf # gzcat vcf/${s}.wex.illumina_mosaik_svm_consensus_wcmc.20110521.exome.genotypes.vcf.gz | awk ' $9 == "GT:PL:GQ" ' >> vcf/${s}.vcf # gzip vcf/${s}.vcf # done ############################################################################################## # # # Looking at the VCF contents: variants, genotypes and meta-data # # # ############################################################################################## ## Quick look at the files # (press 'q' to quit) zless -S vcf/LWK.vcf.gz ## Use PSEQ to look at the VCF (will simply scan through variants) # the head command means only print first ten lines pseq vcf/LWK.vcf.gz v-view | head ## Look in more detail at first variant (and its meta-information) # use a 'mask' to limit the output to first two variants pseq vcf/LWK.vcf.gz v-view --vmeta --mask limit=2 ## subset of same information, in different, rectangular format pseq vcf/LWK.vcf.gz meta-matrix --mask limit=2 --show DP MQ INDEL ## remind me what the tags mean: pseq vcf/LWK.vcf.gz meta-summary ## looking at genotype (meta-)information pseq vcf/LWK.vcf.gz v-view --geno --gmeta --mask limit=2 ## output a filtered VCF (i.e. only include two variant tags, ## MQ and GQ, and genotypes for two individuals, for ## a 2Mb region on chromosome 7) pseq vcf/LWK.vcf.gz write-vcf --show MQ GQ --mask indiv=NA19472,NA19324 --mask reg=chr7:12000000..14000000 ############################################################################################## # # # Resource files # # # ############################################################################################## # cd resources # wget ftp://atguftp.mgh.harvard.edu/plinkseq/hg19/locdb # wget ftp://atguftp.mgh.harvard.edu/plinkseq/hg19/locdb.genes # wget ftp://atguftp.mgh.harvard.edu/plinkseq/hg19/protdb # wget ftp://atguftp.mgh.harvard.edu/plinkseq/hg19/seqdb # wget ftp://atguftp.mgh.harvard.edu/plinkseq/hg19/refdb # wget ftp://atguftp.mgh.harvard.edu/plinkseq/hg19/panel.tar.gz ## ## LOCDB: RefSeq gene transcripts (e.g. from UCSC, or HGNC website) ## # two LOCDBs: by transcripts (for annotation), and by genes (for gene-based tests) pseq . loc-summary --locdb resources/locdb pseq . loc-summary --locdb resources/locdb.genes # first built from GTF file: zless -S resources/refseq.gtf.gz # utility to look for 'rogue transcripts', typically indicative of problems aligning gene # ( CDS not mod 3; mapped to an unknown chromosome (not in SEQDB); maps to multiple chromosomes; # CDS too short; overlapping exons; inconsistent strand; etc ) pseq . gtf-check --file resources/refseq.gtf.gz --seqdb resources/seqdb # Make the LOCDB (already done, no need to repeat) # pseq . loc-load --file refseq.gtf.gz --locdb resources/locdb # Note, also added gene symbols with 'load-alias' and 'loc-alternate-name' commands (see website) # to view the entries in a LOCDB: pseq . loc-view --locdb resources/locdb.genes --group genes ## ## SEQDB: this has been pre-made (from a FASTA downloaded from UCSC) ## pseq . seq-summary --seqdb resources/seqdb ## to view a stretch of reference sequence: pseq . seq-view --seqdb resources/seqdb --region chr22:24000000..24000010 ## ## REFDB: Reference DBs (e.g. dbSNP, OMIM, etc) ## pseq . ref-summary --refdb resources/refdb.hgmd pseq . ref-view --refdb resources/refdb.hgmd --group hgmd --vmeta | head # same info, more human-readable pseq . ref-view --refdb resources/refdb.hgmd --group hgmd --verbose | less # note: REFDB was created from a site-only VCF file with command 'ref-load --vcf /path/to/vcf.gz' ############################################################################################## # # # Creating a PLINK/Seq project -- 24 Kenyan exomes # # # ############################################################################################## # Instead of working with a single raw VCF, typical mode to use plink/seq is as a 'project' # Requires initially loading the VCF into an internal datastore: pseq lwk new-project --resources resources/ --refdb resources/refdb.hgmd pseq lwk load-vcf --vcf vcf/LWK.vcf.gz pseq lwk var-summary ############################################################################################## # # # Variant level summary statistics # # # ############################################################################################## pseq lwk v-stats --stats mean=HM2,HM3 --mask reg=chr1 pseq lwk v-stats --stats count=DP:0-100,DP:200-1000,DP:1000-5000,DP:5000- mac=1,2,3,4,5,6-10,11-20,21- --mask reg=chr1 # can also calculate % in HGMD pseq lwk v-stats --stats ref=hgmd --mask reg=chr1 # pull these out (let's look at singletons only; displays carriers and their genotype data) pseq lwk rv-view --mask reg=chr1 ref.req=hgmd mac=1-1 --verbose --gmeta # filter on HGMD meta-information; 'CLASS', i.e. 'DM' is 'disease causing mutation' pseq lwk rv-view --mask reg=chr1 ref.req=hgmd mac=1-1 v-include=" hgmd_CLASS == 'DM' " --verbose --gmeta ############################################################################################## # # # Per-individual statistics # # # ############################################################################################## pseq lwk i-stats --stats gmean=GQ > istats.txt # View in R R d <- read.table( "istats.txt" , header = T ) # identify one outlier sample in terms of GQ (at nonreference sites) barplot( d$NRG.GQ ) plot( d$NALT , d$NRG.GQ ) cor.test( d$NALT , d$NRG.GQ ) q() ############################################################################################## # # # Masks and filtering # # # ############################################################################################## # based on meta-information pseq lwk v-view --vmeta --show DP MQ --mask reg=chr22 include="DP < 20 || MQ < 10" # do variants with relatively lower DP and/or MQ (as above) have lower Ts/Tv than # the majority of variants? What might that indicate? pseq lwk v-stats --mask include="DP < 20 || MQ < 10" # based on a gene, and individual pseq lwk v-matrix --mask gene=NLRP11 indiv=NA19472,NA19324 # scan for ALT calls based on a low GQ pseq lwk rv-view --gmeta --mask geno=GQ:le:10 --mask mac=1- | head # use genotype-level filters to set genotpes to missing pseq lwk v-view --gmeta --mask reg=chr1:876499 pseq lwk v-view --gmeta --mask reg=chr1:876499 include="gf(GQ>10)" # create new metrics on-the-fly based on the genotype and/or variant meta-information pseq lwk v-view --gmeta --mask reg=chr1:876499 include="X = g(GQ) " --verbose pseq lwk v-view --gmeta --mask reg=chr1:876499 include="X = min(g(GQ)) " --verbose pseq lwk v-view --gmeta --mask reg=chr1:876499 include="X = mean(g(GQ)) " --verbose pseq lwk v-view --gmeta --mask reg=chr1:876499 include="X = g(GT_A) " --verbose # i.e. make new single X variable that is minimum heterozygote GQ of all samples: pseq lwk v-view --gmeta --mask reg=chr1:876499 include="X = min( g( ifelse( GT_A == 1 , GQ , 99 ) ) ) " --verbose ############################################################################################## # # # Gene-based summaries # # # ############################################################################################## # group variants by gene (or any other factor defined in a LOCDB) pseq lwk g-view --mask loc.group=refseq pseq lwk g-view --geno --mask gene=GRIK3 pseq lwk g-view --mask gene=GRIK3 mac=1-5 --rarelist # use the exome browser to view data for a gene(s) (or region(s)) pbrowse lwk # gene-based summaries # note, this will include intronic variants also # note: 'empty.group' mask, to force output for genes w/ no variants pseq lwk g-stats --mask loc.group=genes empty.group --locdb resources/locdb.genes > g.txt R d <- read.table( "g.txt" , header=T ) head(d) hist( d$NVAR ) # how many genes with no variants called? table( d$NVAR == 0 ) ## Q: which gene has the greatest number of variants? ############################################################################################## # # # Annotation # # # ############################################################################################## ## Annotation # (just for chr1, for speed -- takes ~1 min) pseq lwk counts --annotate refseq --mask reg=chr1 > annot.counts # Exercise : using R -- load in file with read.table() # 1) how many missense, nonsense, etc? # 2) relationship between MAF and functional status? R d <- read.table("annot.counts",header=T,sep="\t") head( d ) table( d$FUNC ) table( d$CNT == 1 , d$FUNC == 'nonsense' | d$FUNC == 'esplice' ) fisher.test( d$CNT == 1 , d$FUNC == 'nonsense' | d$FUNC == 'esplice' ) ## more verbose annotation tool (only for chr22) pseq lwk lookup --annotate refseq --mask reg=chr22 > chr22-annot.txt # attach this as meta-information back into the project, and use filter to pull out nonsense variants, for example pseq lwk v-view --mask reg=chr22 meta.file=chr22-annot.txt v-include=" worst == 'nonsense' " \ --verbose --show transcript protein codon --ignore-warnings ## additional protein-domain based annotation using a PROTDB for annotation pseq lwk lookup --annotate refseq --mask reg=chr22 limit=10 --protdb resources/protdb ## gs-view pseq lwk gs-view --gene GRIK1 --annotate refseq --protdb resources/protdb --domain HMMPfam --domain HMMPfam pseq lwk gs-view --gene GRIK1 --annotate refseq --protdb resources/protdb --domain HMMPfam --domain HMMPfam --plot --out p1 R --vanilla < p1.gsview.R { view plotNM_000830.pdf } ############################################################################################## # # # Single site variant metrics # # # ############################################################################################## # pull out allele frequencies and HWE p-values pseq lwk v-freq --mask reg=chr20 | gcol VAR MAF HWE # look at genotype counts for some extreme HWD failures pseq lwk g-counts --mask hwe=0:1e-7 reg=chr20 ############################################################################################## # # # Make a combined project ('EUR') with GBR and TSI # # # ############################################################################################## pseq eur new-project --resources resources/ --refdb resources/refdb.hgmd pseq eur load-vcf --vcf vcf/GBR.vcf.gz vcf/TSI.vcf.gz ## add filename tags pseq eur tag-file --id 1 --name GBR pseq eur tag-file --id 2 --name TSI pseq eur var-summary ############################################################################################## # # # Some dummy 'phenotype' information # # # ############################################################################################## pseq eur i-view --from-vardb | head pseq eur i-view --from-vardb \ | awk ' BEGIN { printf "##phe,Integer\n#ID\tphe\n" } \ $4 == "GBR" { print $3,2 } \ $4 == "TSI" { print $3,1 } \ ' OFS="\t" \ > pop.phe less pop.phe pseq eur load-pheno --file pop.phe pseq eur i-view --phenotype phe ############################################################################################## # # # Comparing two VCFs # # # ############################################################################################## # variants seen in two files (chr1 only) pseq eur v-stats --mask obs.file.req=GBR,TSI reg=chr1 # 1+ alternate allele seen in TSI, not CEU pseq eur v-stats --mask alt.file=TSI alt.file.ex=GBR reg=chr1 ############################################################################################## # # # Ancestry inference and relatedness # # # ############################################################################################## tar -xzvf panel.tar.gz less panel.pfrq pseq lwk i-pop --file panel.pfrq --out out/lwk pseq eur i-pop --file panel.pfrq --out out/eur # Look at pairwise sharing (relatedness/contamination) pseq lwk ibs-matrix --mask mac=2-5 --long-format --two-counts ## Q: ## a) are there any pairs of individuals who appear to be strongly related ## b) bonus : create a MDS/PCA plot for these two samples ## variants unique to (or enriched in) a particular group # for the two individuals showing excess sharing (from ibs-matrix command, above) pseq lwk unique --indiv NA19334 NA19313 ############################################################################################## # # # Generate a PLINK dataset # # # ############################################################################################## pseq asw write-plink --mask mac=2- pseq asw write-ped --mask mac=2- --out out/f1 plink --tfile out/f1 --make-bed --out out/f1 plink --bfile out/f1 --genome --out out/f1-ibs plink --bfile out/f1 --read-genome --mds-plot 2 --out out/mds R d <- read.table("out/mds.mds",header=T) plot(d$C1,d$C2) q() fgrep NA19334 out/f1-ibs.genome ############################################################################################## # # # Single site association # # # ############################################################################################## # Fisher's exact; can use (stratified) permutation too pseq eur v-assoc --phenotype phe --mask reg=chr1 | gcol MINA MINU OBSA OBSU VAR OR P > a.txt # also a 'glm' command -- can include covariates, quantitative traits, etc ## look at results R d <- read.table("a.txt",header=T,sep="\t") plot( -log10( d$P ) ) d[ d$P < 1e-10 , ] # make a QQ plot qqplot <- function( pv , label = "" ) { pv <- pv[ is.finite(pv) ] n <- length(pv) j <- 1:n x <- (j-0.5)/n d <- sort(pv) mx <- range( -log10(pv) ) plot( -log10(x) , -log10(d) , ylim = mx , xlim=mx , type="n" , main = paste( label , "(" , length(pv) , "tests )" ) , cex.main=1, xlab="Expected -log10(P)" , ylab="Observed -log10(P)" ) lines( c(0,max(-log10(pv))), c(0,max(-log10(pv))) , col="red" ) points( -log10(x) , -log10(d) , ylim = mx , xlim=mx , xlab="Expected -log10(P)" , ylab="Observed -log10(P)" , cex=0.8, pch=21, bg="palegreen4" , col="darkgray" ) } q() ## what's going on with some of these calls? clearly unlikely genotype calls pseq eur rv-view --mask reg=chr1:47133811 --phenotype phe --gmeta ## suggests: a) more stringent filtering (w/ deep coverage data, often works well) ## b) tests based on likelihoods, expected dosages, instead of hard calls ############################################################################################## # # # Gene-based association # # # ############################################################################################## # a naive application of gene-based test (SKAT) to CEU vs TSI pseq eur assoc --phenotype phe --mask loc.group=genes --locdb resources/locdb.genes --mask mac=2-10 --out genebased awk ' $6 < 1e-2 ' genebased.assoc grep -w ZNF431 genebased.assoc.det pseq eur v-view --gmeta --mask reg=chr19:21349311,chr19:21350330 ## force all missing genotypes to reference/reference pseq eur assoc --phenotype phe --mask loc.group=genes assume-ref --locdb resources/locdb.genes --mask mac=2-10 --out genebased2 awk ' $6 < 1e-2 ' genebased2.assoc ## generate gene-based QQ plot R d <- read.table("genebased2.assoc",header=T) qqplot( d$P ) ############################################################################################## # # # Using the Rplinkseq interface # # # ############################################################################################## R library(Rplinkseq) pseq.project( "eur" ) # obtain a 'list' of variant and genotype information # note, fixed at 1000 variants being returned by default in Rplinkseq # add limit=10000 to mask to increase, for example k <- var.fetch( mask = "reg=chr22 file=GBR" ) length(k) str(k[[1]]) # get column from VCF d <- x.core( k , "BP1" ) str(d) # extract variant meta-information d2 <- x.consensus.meta( k , "DP" ) str(d2) plot( d , d2 ) # genotype by individual matrix d <- x.consensus.genotype( k ) str( d ) # histogram of allele frequencies hist( apply( d , 1 , mean , na.rm = T ) / 2 ) # get variant-meta information as a matrix with meta.fetch() -- more # effecient than retrieving whole variant-lists for large samples) d <- meta.fetch( c( "CHROM" , "POS" , "MQ" , "DP" ) , "file=GBR reg=chr7" ) head(d) # get phenotypes p <- ind.fetch( "phe" ) str(p) # get both GBR and TSI k <- var.fetch( mask = "reg=chr22 obs.file.req=GBR,TSI" ) d <- x.consensus.genotype( k ) t.test( d[ 1 , ] ~ p$phe ) # access the genotypes in this list table( x.consensus.genotype( k[1] ) , useNA = "ifany" ) # define, then apply a simple function f <- function(v) { cat("variant",v$CHR,v$BP1,v$CON$REF,"/",v$CON$ALT,"\n") } var.iterate( f , "limit=100" ) # second example: to calculate Ts/Tv ti <<- tv <<- 0 f_titv <- function( v ) { ifelse( sum( sort(c(v$CON$ALT,v$CON$REF)) == c("A","G") ) == 2 | sum( sort(c(v$CON$ALT,v$CON$REF)) == c("C","T") ) == 2 , ti <<- ti+1 , tv <<- tv+1 ) } # for speed, just look at first 10K variants var.iterate( f_titv , "limit=10000" ) ti / tv # a bit awkward and slower... but very flexible... ############################################################################################## # # # Geneset association tests # # # ############################################################################################## # BURDEN tests; condition just on 'singletons' # this generates 1000 null datasets by permutation and stores all "BURDEN" # statistics (of a greater burden of rare mutations in GBR vs TSI, in this case) pseq eur assoc --phenotype phe --tests burden --perm 1000 --dump-null-matrix \ --mask loc.group=genes assume-ref mac=1-1 --locdb resources/locdb.genes --out burden ## here's one I made earlier: cp ~pshaun/2013/extra/burden.matrix . cp ~pshaun/2013/extra/burden.assoc . cp ~pshaun/2013/extra/burden.assoc.det . less -S burden.matrix # grab the necessary auxiliary files for the set-test cp ~pshaun/2013/extra/gene.list cp ~pshaun/2013/extrat/equiv.txt # two generic 'gene-sets' cp ~pshaun/2013/extra/gene-families.set cp ~pshaun/2013/extra/tissues.set # run the set-based tests: smp -m gene.list equiv.txt tissues.set burden.matrix > set-tissues.txt grep ^SET set-tissues.txt # any 'enriched' sets awk ' $1 == "SET" && $4 < 0.01 ' set-tissues.txt # what are the genes? awk ' $NF == "mhc_class2_sw" ' set-tissues.txt smp -m gene.list equiv.txt gene-families.set burden.matrix > set-families.txt awk ' $1 == "SET" && $4 < 0.001 ' set-families.txt ## ## Aside: adaptive permutation with i-stats # (gives up early on non-significant genes) pseq eur assoc --phenotype phe --tests burden \ --mask loc.group=genes assume-ref mac=2-10 --locdb resources/locdb.genes --info --out b2 R d <- read.table( "b2.assoc" , header=T) hist( d$P , breaks = 50 ) hist( d$I , breaks=100 ) hist( d$P[ d$I < 0.1 ] , breaks = 50 , xlim=c(0,1) )