mkdir ~/ngs
cd ~/ngs

## Copy main data over
cp ~pshaun/pseq/example/ngs/* .

## Take a peek at the VCF
zless CEU.exon.2010_09.genotypes.vcf.gz

## Create a project
pseq ~/ngs/proj new-project --vcf *.vcf.gz --metameta ~/ngs/meta.meta --resources ~pshaun/pseq/resources/hg18

## Load the VCFs
pseq proj load-vcf

## Check it worked
pseq proj vardb-summary

## Add some more friendly file names
pseq proj tag-file --name CEU --id 1 
pseq proj tag-file --name TSI --id 2 
pseq proj vardb-summary

## Dump individual variants
pseq proj v-view
 
## View a subset of variants and sample-specific meta-information, 
## for first variants only
pseq proj v-view --vmeta --samples --verbose --mask limit=1

## View individual genotype data
pseq proj v-view --geno --gmeta  | head -20

## Load in individual "phenotype" information
less pop.phe
pseq proj load-pheno --file pop.phe
pseq proj i-view | head


## Browse the file using the web-browser
## 1) Go to http://workshop/pseq
## 2) Enter in project box:  /home/{user-name}/ngs/proj
## 3) Click on 'list' link of the 'Optional variant meta-fields' to see meta-fields in VCFs
## 4) CLick on 'list' link of the 'Optional case/control phenotype' to see phenotypes in project
## 5) Enter gene name 'GRIK3' in Gene ID box (top left) and click 'Fetch' button
## 6) Enter 'AA AC DP' in optional variant box; enter 'phe1' in phenotype box
## 7) Click on 'View' for the second variant in list 
## 8) Click on the link on the rsID to go to dbSNP
## 9) Click on an individual's ID to see all genotypes for that individual in GRIK3


## Annotate variants
pseq proj counts --mask reg=chr22 --options annotate

## Take a look at the reference/locus databases
pseq proj locdb-summary

## Look at individual genotypes for a gene
pseq proj g-view --geno --mask gene=GRIK3 

## Focus on one individual
pseq proj g-view --geno --gmeta --mask gene=GRIK3 indiv=NA20525

## Masks are described in more detail here: http://atgu.mgh.harvard.edu/plinkseq/masks.shtml

## Use masks to filter variants based on meta-information
pseq proj v-view --vmeta --samples --mask reg=chr7 include=" BN == 123 && DP < 2000 " 

## Which individuals cases carry rare-variants in a gene? (mac = minor allele count between 1 and 5)
pseq proj g-view --phenotype phe1 --mask gene=GRIK3 mac=1-5 --options rarelist


## Output data in differnt formats
pseq proj v-matrix --mask gene=NLRP11 indiv=NA20808,NA20774

## Output tables of variant meta-information
pseq proj meta-matrix --mask reg=chr22 file=TSI --show AN AA DP BN 

## Summary statistics per gene (chr1 and 2 only)
## calculates Ti/Tv, dbSNP%, G1K%, # singletons, etc
pseq proj i-stats --mask reg=chr1,chr2 > out.txt

## Examine distribution in R (first type 'R' on command line)
d <- read.table( "out.txt" , header=T )
str(d)

## load labels, and merge with data
p <- read.table("pop.phe",header=F)
names(p) <- c("ID" , "POP" )
d <- merge( d, p , by="ID" )

## distribution of # 'singletons' per individual
table( d$SING ) 

## call rate per individual (col. by batch)
plot( d$RATE , col = d$POP )

## Proportion of non-reference genotypes
plot( d$NALT / d$NOBS  , col = d$POP )

## Distribution of Ti/Tv
hist( d$TI.TV , breaks=20 ) 

## Distribution of dbSNP%
hist( d$DBSNP / d$NALT , breaks=20)

## Now quit R
q()


## We can also get similar summary statistics for the whole dataset,
## or per-gene (v-stats and g-stats commands), applying various masks
## also (i.e. to help determine thresholds to use for QC)


## Various lookups/filtering approaches to the data

## Intersect a list of variants (e.g. from another study)
less lookup.txt
pseq proj lookup --file lookup.txt --phenotype phe1

## Functional annotation of variants
pseq proj counts --phenotype phe1 --options annotate --mask reg=chr22

## PolyPhen2
## 3 cols:  variant   P(deleterious)   discrete (0,1,2,3)
pseq proj score-pph2 --name ~pshaun/pseq/resources/hg18/pph2.db | head 

## List variants that are unique to two individuals (and not in HapMap)
pseq proj unique --indiv NA11843 NA20772 --mask include=" ! ( HM2 || HM3 ) "

## Filter on variants not in dbSNP(130)
pseq proj v-view --mask ref.ex=dbSNP130

## Append HGMD annotations
pseq proj v-view --vmeta --verbose --mask ref=hgmd --refdb ~pshaun/pseq/resources/hg18/hgmd.db


##
## Association testing
##

## Look at general distribution of rare variants between CEU and TSI
pseq proj v-dist --phenotype phe1

## Single variant association tests
pseq proj v-assoc --phenotype phe1 > out.2

## some simple utilities to view the output
less out.2
cat out.2 | behead | less
cat out.2 | gcol VAR MINA MINU OR P| awk ' $4 < 0.001 '

# Some HWE outliers
gcol VAR HWE REFA HETA HOMA REFU HETU HOMU < out.2 | awk ' $2 < 1e-8 '

## permutation

## gene-based tests
pseq proj assoc --phenotype phe1 --mask loc.group=refseq mac=2-10 --options calpha > out.3

head out.3

awk ' NR ==1 || $6 < 1e-4 ' out.3 | behead

# Look at CES1 in exome browser: severe issues with 'non-random missing data'

# Permute but adjusting for missing data
pseq proj assoc --phenotype phe1 --mask loc.group=refseq mac=2-10 --options calpha fix-null > out.4

awk ' NR ==1 || $6 < 1e-4 ' out.4 | behead
awk ' NR ==1 || $6 < 1e-2 ' out.4 | behead

## 

d <- read.table("out.3" , header=T ) 
hist( d$I , breaks=200)

d <- read.table("out.4" , header=T ) 
hist( d$I , breaks=200)