mkdir ~/ngs cd ~/ngs ## Copy main data over cp ~pshaun/pseq/example/ngs/* . ## Take a peek at the VCF zless CEU.exon.2010_09.genotypes.vcf.gz ## Create a project pseq ~/ngs/proj new-project --vcf *.vcf.gz --metameta ~/ngs/meta.meta --resources ~pshaun/pseq/resources/hg18 ## Load the VCFs pseq proj load-vcf ## Check it worked pseq proj vardb-summary ## Add some more friendly file names pseq proj tag-file --name CEU --id 1 pseq proj tag-file --name TSI --id 2 pseq proj vardb-summary ## Dump individual variants pseq proj v-view ## View a subset of variants and sample-specific meta-information, ## for first variants only pseq proj v-view --vmeta --samples --verbose --mask limit=1 ## View individual genotype data pseq proj v-view --geno --gmeta | head -20 ## Load in individual "phenotype" information less pop.phe pseq proj load-pheno --file pop.phe pseq proj i-view | head ## Browse the file using the web-browser ## 1) Go to http://workshop/pseq ## 2) Enter in project box: /home/{user-name}/ngs/proj ## 3) Click on 'list' link of the 'Optional variant meta-fields' to see meta-fields in VCFs ## 4) CLick on 'list' link of the 'Optional case/control phenotype' to see phenotypes in project ## 5) Enter gene name 'GRIK3' in Gene ID box (top left) and click 'Fetch' button ## 6) Enter 'AA AC DP' in optional variant box; enter 'phe1' in phenotype box ## 7) Click on 'View' for the second variant in list ## 8) Click on the link on the rsID to go to dbSNP ## 9) Click on an individual's ID to see all genotypes for that individual in GRIK3 ## Annotate variants pseq proj counts --mask reg=chr22 --options annotate ## Take a look at the reference/locus databases pseq proj locdb-summary ## Look at individual genotypes for a gene pseq proj g-view --geno --mask gene=GRIK3 ## Focus on one individual pseq proj g-view --geno --gmeta --mask gene=GRIK3 indiv=NA20525 ## Masks are described in more detail here: http://atgu.mgh.harvard.edu/plinkseq/masks.shtml ## Use masks to filter variants based on meta-information pseq proj v-view --vmeta --samples --mask reg=chr7 include=" BN == 123 && DP < 2000 " ## Which individuals cases carry rare-variants in a gene? (mac = minor allele count between 1 and 5) pseq proj g-view --phenotype phe1 --mask gene=GRIK3 mac=1-5 --options rarelist ## Output data in differnt formats pseq proj v-matrix --mask gene=NLRP11 indiv=NA20808,NA20774 ## Output tables of variant meta-information pseq proj meta-matrix --mask reg=chr22 file=TSI --show AN AA DP BN ## Summary statistics per gene (chr1 and 2 only) ## calculates Ti/Tv, dbSNP%, G1K%, # singletons, etc pseq proj i-stats --mask reg=chr1,chr2 > out.txt ## Examine distribution in R (first type 'R' on command line) d <- read.table( "out.txt" , header=T ) str(d) ## load labels, and merge with data p <- read.table("pop.phe",header=F) names(p) <- c("ID" , "POP" ) d <- merge( d, p , by="ID" ) ## distribution of # 'singletons' per individual table( d$SING ) ## call rate per individual (col. by batch) plot( d$RATE , col = d$POP ) ## Proportion of non-reference genotypes plot( d$NALT / d$NOBS , col = d$POP ) ## Distribution of Ti/Tv hist( d$TI.TV , breaks=20 ) ## Distribution of dbSNP% hist( d$DBSNP / d$NALT , breaks=20) ## Now quit R q() ## We can also get similar summary statistics for the whole dataset, ## or per-gene (v-stats and g-stats commands), applying various masks ## also (i.e. to help determine thresholds to use for QC) ## Various lookups/filtering approaches to the data ## Intersect a list of variants (e.g. from another study) less lookup.txt pseq proj lookup --file lookup.txt --phenotype phe1 ## Functional annotation of variants pseq proj counts --phenotype phe1 --options annotate --mask reg=chr22 ## PolyPhen2 ## 3 cols: variant P(deleterious) discrete (0,1,2,3) pseq proj score-pph2 --name ~pshaun/pseq/resources/hg18/pph2.db | head ## List variants that are unique to two individuals (and not in HapMap) pseq proj unique --indiv NA11843 NA20772 --mask include=" ! ( HM2 || HM3 ) " ## Filter on variants not in dbSNP(130) pseq proj v-view --mask ref.ex=dbSNP130 ## Append HGMD annotations pseq proj v-view --vmeta --verbose --mask ref=hgmd --refdb ~pshaun/pseq/resources/hg18/hgmd.db ## ## Association testing ## ## Look at general distribution of rare variants between CEU and TSI pseq proj v-dist --phenotype phe1 ## Single variant association tests pseq proj v-assoc --phenotype phe1 > out.2 ## some simple utilities to view the output less out.2 cat out.2 | behead | less cat out.2 | gcol VAR MINA MINU OR P| awk ' $4 < 0.001 ' # Some HWE outliers gcol VAR HWE REFA HETA HOMA REFU HETU HOMU < out.2 | awk ' $2 < 1e-8 ' ## permutation ## gene-based tests pseq proj assoc --phenotype phe1 --mask loc.group=refseq mac=2-10 --options calpha > out.3 head out.3 awk ' NR ==1 || $6 < 1e-4 ' out.3 | behead # Look at CES1 in exome browser: severe issues with 'non-random missing data' # Permute but adjusting for missing data pseq proj assoc --phenotype phe1 --mask loc.group=refseq mac=2-10 --options calpha fix-null > out.4 awk ' NR ==1 || $6 < 1e-4 ' out.4 | behead awk ' NR ==1 || $6 < 1e-2 ' out.4 | behead ## d <- read.table("out.3" , header=T ) hist( d$I , breaks=200) d <- read.table("out.4" , header=T ) hist( d$I , breaks=200)