##############################################################################################
#                                                                                            #
# Setting things up                                                                          #
#                                                                                            #
##############################################################################################

mkdir pseq-practical
cd pseq-practical
cp /opt/pseq/data/exomes.tar.gz . 
tar -xzvf exome.tar.gz
## This is the efficient way to do it at the workshop
ln -s /opt/pseq/resources

## Don't run these commands that start with "#"
#mkdir resources
#cp /opt/pseq/resources/* resources/


## Move to working directory for this practical
## Check folder contents

ls -lR


## Check pseq is installed correctly

pseq

pseq help

pseq help views

pseq help v-view


##############################################################################################
#                                                                                            #
# The exome sequence data (VCFs)                                                             #
#                                                                                            #
##############################################################################################

## Get VCF exome data from 1000 Genomes FTP

# GBR British in England and Scotland
# TSI Toscani in Italia
# LWK Luhya in Webuye, Kenya

# cd vcf
# wget -c ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20110810_exome_consensus_snps/GBR.wex.illumina_mosaik_svm_consensus_wcmc.20110521.exome.genotypes.vcf.gz
# wget -c ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20110810_exome_consensus_snps/TSI.wex.illumina_mosaik_svm_consensus_wcmc.20110521.exome.genotypes.vcf.gz
# wget -c ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20110810_exome_consensus_snps/LWK.wex.illumina_mosaik_svm_consensus_wcmc.20110521.exome.genotypes.vcf.gz
# cd ..

## Note: pre-processing step; these are distributed to have exactly the same number of rows (DONE)

# for s in GBR TSI LWK ASW
# do
#  zgrep ^# vcf/${s}.wex.illumina_mosaik_svm_consensus_wcmc.20110521.exome.genotypes.vcf.gz > vcf/${s}.vcf
# gzcat vcf/${s}.wex.illumina_mosaik_svm_consensus_wcmc.20110521.exome.genotypes.vcf.gz | awk ' $9 == "GT:PL:GQ" ' >> vcf/${s}.vcf
# gzip vcf/${s}.vcf
# done


##############################################################################################
#                                                                                            #
# Looking at the VCF contents: variants, genotypes and meta-data                             #
#                                                                                            #
##############################################################################################

## Quick look at the files
# (press 'q' to quit)

zless -S vcf/LWK.vcf.gz


## Use PSEQ to look at the VCF (will simply scan through variants)
# the head command means only print first ten lines

pseq vcf/LWK.vcf.gz v-view | head

 
## Look in more detail at first variant (and its meta-information)
# use a 'mask' to limit the output to first two variants

pseq vcf/LWK.vcf.gz v-view --vmeta --mask limit=2 


## subset of same information, in different, rectangular format

pseq vcf/LWK.vcf.gz  meta-matrix  --mask limit=2 --show DP MQ INDEL 


## remind me what the tags mean:

pseq vcf/LWK.vcf.gz meta-summary


## looking at genotype (meta-)information

pseq vcf/LWK.vcf.gz  v-view  --geno --gmeta --mask limit=2


## output a filtered VCF (i.e. only include two variant tags,
## MQ and GQ, and genotypes for two individuals, for
## a 2Mb region on chromosome 7)

pseq vcf/LWK.vcf.gz  write-vcf  --show MQ GQ  --mask indiv=NA19472,NA19324  --mask reg=chr7:12000000..14000000


##############################################################################################
#                                                                                            #
# Resource files                                                                             #
#                                                                                            #
##############################################################################################

# cd resources

# wget ftp://atguftp.mgh.harvard.edu/plinkseq/hg19/locdb
# wget ftp://atguftp.mgh.harvard.edu/plinkseq/hg19/locdb.genes
# wget ftp://atguftp.mgh.harvard.edu/plinkseq/hg19/protdb
# wget ftp://atguftp.mgh.harvard.edu/plinkseq/hg19/seqdb
# wget ftp://atguftp.mgh.harvard.edu/plinkseq/hg19/refdb
# wget ftp://atguftp.mgh.harvard.edu/plinkseq/hg19/panel.tar.gz


##
## LOCDB: RefSeq gene transcripts (e.g. from UCSC, or HGNC website)
##

# two LOCDBs: by transcripts (for annotation), and by genes (for gene-based tests)

pseq . loc-summary --locdb resources/locdb

pseq . loc-summary --locdb resources/locdb.genes

# first built from GTF file: 

zless -S resources/refseq.gtf.gz

# utility to look for 'rogue transcripts', typically indicative of problems aligning gene  
#  ( CDS not mod 3;  mapped to an unknown chromosome (not in SEQDB); maps to multiple chromosomes; 
#    CDS too short; overlapping exons; inconsistent strand; etc )

pseq . gtf-check --file resources/refseq.gtf.gz --seqdb resources/seqdb 

# Make the LOCDB  (already done, no need to repeat)
# pseq . loc-load --file refseq.gtf.gz --locdb resources/locdb

# Note, also added gene symbols with 'load-alias' and 'loc-alternate-name' commands (see website)

# to view the entries in a LOCDB: 

pseq . loc-view --locdb resources/locdb.genes --group genes 


##
## SEQDB: this has been pre-made (from a FASTA downloaded from UCSC)
##

pseq . seq-summary --seqdb resources/seqdb 


## to view a stretch of reference sequence:

pseq . seq-view  --seqdb resources/seqdb  --region chr22:24000000..24000010 


##
## REFDB: Reference DBs  (e.g. dbSNP, OMIM, etc)
##

pseq . ref-summary  --refdb resources/refdb.hgmd

pseq . ref-view     --refdb resources/refdb.hgmd --group hgmd --vmeta  | head 

# same info, more human-readable
pseq . ref-view     --refdb resources/refdb.hgmd --group hgmd --verbose  | less 

# note: REFDB was created from a site-only VCF file with command  'ref-load  --vcf /path/to/vcf.gz'


##############################################################################################
#                                                                                            #
# Creating a PLINK/Seq project -- 24 Kenyan exomes                                           #
#                                                                                            #
##############################################################################################

# Instead of working with a single raw VCF, typical mode to use plink/seq is as a 'project'
# Requires initially loading the VCF into an internal datastore:

pseq lwk new-project --resources resources/  --refdb resources/refdb.hgmd

pseq lwk load-vcf --vcf vcf/LWK.vcf.gz

pseq lwk var-summary


##############################################################################################
#                                                                                            #
# Variant level summary statistics                                                           #
#                                                                                            #
##############################################################################################


pseq lwk v-stats --stats mean=HM2,HM3 --mask reg=chr1

pseq lwk v-stats --stats count=DP:0-100,DP:200-1000,DP:1000-5000,DP:5000-  mac=1,2,3,4,5,6-10,11-20,21- --mask reg=chr1

# can also calculate % in HGMD 

pseq lwk v-stats --stats ref=hgmd  --mask reg=chr1

# pull these out (let's look at singletons only; displays carriers and their genotype data)

pseq lwk rv-view --mask reg=chr1 ref.req=hgmd mac=1-1 --verbose --gmeta 

# filter on HGMD meta-information; 'CLASS', i.e. 'DM' is 'disease causing mutation'

pseq lwk rv-view --mask reg=chr1 ref.req=hgmd mac=1-1 v-include=" hgmd_CLASS == 'DM' "  --verbose --gmeta


##############################################################################################
#                                                                                            #
# Per-individual statistics                                                                  #
#                                                                                            #
##############################################################################################


pseq lwk i-stats --stats gmean=GQ > istats.txt


# View in R

R
d <- read.table( "istats.txt" , header = T ) 

# identify one outlier sample in terms of GQ (at nonreference sites)
barplot( d$NRG.GQ ) 

plot( d$NALT , d$NRG.GQ ) 

cor.test( d$NALT , d$NRG.GQ ) 

q()


##############################################################################################
#                                                                                            #
# Masks and filtering                                                                        #
#                                                                                            #
##############################################################################################

# based on meta-information
pseq lwk v-view --vmeta --show DP MQ --mask reg=chr22 include="DP < 20 || MQ < 10" 

# do variants with relatively lower DP and/or MQ (as above) have lower Ts/Tv than 
# the majority of variants?  What might that indicate?

pseq lwk v-stats --mask include="DP < 20 || MQ < 10" 


# based on a gene, and individual
pseq lwk v-matrix --mask gene=NLRP11 indiv=NA19472,NA19324


# scan for ALT calls based on a low GQ 
pseq lwk rv-view --gmeta --mask geno=GQ:le:10 --mask mac=1- | head

# use genotype-level filters to set genotpes to missing
pseq lwk v-view --gmeta --mask reg=chr1:876499 

pseq lwk v-view --gmeta --mask reg=chr1:876499 include="gf(GQ>10)"

# create new metrics on-the-fly based on the genotype and/or variant meta-information

pseq lwk v-view --gmeta --mask reg=chr1:876499 include="X = g(GQ) " --verbose
pseq lwk v-view --gmeta --mask reg=chr1:876499 include="X = min(g(GQ)) " --verbose
pseq lwk v-view --gmeta --mask reg=chr1:876499 include="X = mean(g(GQ)) " --verbose
pseq lwk v-view --gmeta --mask reg=chr1:876499 include="X = g(GT_A) " --verbose

# i.e. make new single X variable that is minimum heterozygote GQ of all samples:
pseq lwk v-view --gmeta --mask reg=chr1:876499 include="X = min( g( ifelse( GT_A == 1 , GQ , 99 ) ) ) " --verbose


##############################################################################################
#                                                                                            #
# Gene-based summaries                                                                       #
#                                                                                            #
##############################################################################################

# group variants by gene (or any other factor defined in a LOCDB)

pseq lwk g-view --mask loc.group=refseq 

pseq lwk g-view --geno --mask gene=GRIK3 

pseq lwk g-view --mask gene=GRIK3 mac=1-5  --rarelist

# use the exome browser to view data for a gene(s) (or region(s))

pbrowse lwk


# gene-based summaries
# note, this will include intronic variants also
# note: 'empty.group' mask, to force output for genes w/ no variants

pseq lwk g-stats --mask loc.group=genes empty.group --locdb resources/locdb.genes > g.txt

R

d <- read.table( "g.txt" , header=T )

head(d)

hist( d$NVAR ) 

# how many genes with no variants called?
 
table( d$NVAR == 0 ) 

## Q: which gene has the greatest number of variants?


##############################################################################################
#                                                                                            #
# Annotation                                                                                 #
#                                                                                            #
##############################################################################################


## Annotation

# (just for chr1, for speed -- takes ~1 min)

pseq lwk counts --annotate refseq --mask reg=chr1 > annot.counts


# Exercise : using R -- load in file with read.table()
#  1) how many missense, nonsense, etc?
#  2) relationship between MAF and functional status?

R
d <- read.table("annot.counts",header=T,sep="\t")

head( d )
table( d$FUNC ) 

table( d$CNT == 1 ,  d$FUNC == 'nonsense' | d$FUNC == 'esplice' ) 
fisher.test( d$CNT == 1 ,  d$FUNC == 'nonsense' | d$FUNC == 'esplice' ) 


## more verbose annotation tool (only for chr22)

pseq lwk lookup --annotate refseq  --mask reg=chr22 > chr22-annot.txt 


# attach this as meta-information back into the project, and use filter to pull out nonsense variants, for example

pseq lwk v-view --mask reg=chr22  meta.file=chr22-annot.txt  v-include=" worst == 'nonsense' " \
                --verbose  --show transcript protein codon  --ignore-warnings


## additional protein-domain based annotation using a PROTDB for annotation

pseq lwk lookup --annotate refseq  --mask reg=chr22 limit=10  --protdb resources/protdb  


## gs-view 

pseq lwk gs-view --gene GRIK1 --annotate refseq --protdb resources/protdb --domain HMMPfam --domain HMMPfam

pseq lwk gs-view --gene GRIK1 --annotate refseq --protdb resources/protdb --domain HMMPfam --domain HMMPfam  --plot --out p1

R --vanilla < p1.gsview.R 

{ view plotNM_000830.pdf } 


##############################################################################################
#                                                                                            #
# Single site variant metrics                                                                #
#                                                                                            #
##############################################################################################


# pull out allele frequencies and HWE p-values
pseq lwk v-freq --mask reg=chr20 | gcol VAR MAF HWE 

# look at genotype counts for some extreme HWD failures
pseq lwk g-counts --mask hwe=0:1e-7 reg=chr20 


##############################################################################################
#                                                                                            #
# Make a combined project ('EUR') with GBR and TSI                                           #
#                                                                                            #
##############################################################################################


pseq eur new-project --resources resources/  --refdb resources/refdb.hgmd

pseq eur load-vcf --vcf vcf/GBR.vcf.gz vcf/TSI.vcf.gz 

## add filename tags

pseq eur tag-file --id 1 --name GBR

pseq eur tag-file --id 2 --name TSI

pseq eur var-summary


##############################################################################################
#                                                                                            #
# Some dummy 'phenotype' information                                                         #
#                                                                                            #
##############################################################################################

pseq eur i-view --from-vardb | head

pseq eur i-view --from-vardb \
 | awk ' BEGIN       { printf "##phe,Integer\n#ID\tphe\n" } \
         $4 == "GBR" { print $3,2 } \
         $4 == "TSI" { print $3,1 } \ 
       ' OFS="\t" \
 > pop.phe

less pop.phe 

pseq eur load-pheno --file pop.phe 

pseq eur i-view --phenotype phe 


##############################################################################################
#                                                                                            #
# Comparing two VCFs                                                                         #
#                                                                                            #
##############################################################################################


# variants seen in two files (chr1 only)

pseq eur v-stats --mask obs.file.req=GBR,TSI  reg=chr1

# 1+ alternate allele seen in TSI, not CEU

pseq eur v-stats --mask alt.file=TSI alt.file.ex=GBR   reg=chr1


##############################################################################################
#                                                                                            #
# Ancestry inference and relatedness                                                         #
#                                                                                            #
##############################################################################################


tar -xzvf panel.tar.gz

less panel.pfrq

pseq lwk i-pop --file panel.pfrq --out out/lwk

pseq eur i-pop --file panel.pfrq --out out/eur


# Look at pairwise sharing (relatedness/contamination)

pseq lwk ibs-matrix --mask mac=2-5 --long-format --two-counts 


## Q:
##  a) are there any pairs of individuals who appear to be strongly related 
##  b) bonus : create a MDS/PCA plot for these two samples


## variants unique to (or enriched in) a particular group
# for the two individuals showing excess sharing (from ibs-matrix command, above)

pseq lwk unique --indiv NA19334 NA19313


##############################################################################################
#                                                                                            #
# Generate a PLINK dataset                                                                   #
#                                                                                            #
##############################################################################################

pseq asw write-plink --mask mac=2-

pseq asw write-ped --mask mac=2- --out out/f1

plink --tfile out/f1 --make-bed --out out/f1

plink --bfile out/f1 --genome --out out/f1-ibs

plink --bfile out/f1 --read-genome --mds-plot 2 --out out/mds

R
d <- read.table("out/mds.mds",header=T)
plot(d$C1,d$C2)
q()

fgrep NA19334 out/f1-ibs.genome 


##############################################################################################
#                                                                                            #
# Single site association                                                                    #
#                                                                                            #
##############################################################################################


# Fisher's exact;  can use (stratified) permutation too 

pseq eur v-assoc --phenotype phe --mask reg=chr1 | gcol MINA MINU OBSA OBSU VAR OR P > a.txt

# also a 'glm' command -- can include covariates, quantitative traits, etc


## look at results

R
d <- read.table("a.txt",header=T,sep="\t")

plot( -log10( d$P ) ) 

d[ d$P < 1e-10 , ] 

# make a QQ plot

qqplot <- function( pv , label = "" )
{
 pv <- pv[ is.finite(pv) ]
 n <- length(pv)
 j <- 1:n
 x <- (j-0.5)/n
 d <- sort(pv)
 mx <- range( -log10(pv) )
 plot( -log10(x) , -log10(d) , ylim = mx , xlim=mx , type="n" ,  main = paste( label , "(" , length(pv) , "tests )" ) , cex.main=1, xlab="Expected -log10(P)" , ylab="Observed -log10(P)"  ) 
 lines( c(0,max(-log10(pv))), c(0,max(-log10(pv))) , col="red" ) 
 points( -log10(x) , -log10(d) , ylim = mx , xlim=mx , xlab="Expected -log10(P)" , ylab="Observed -log10(P)" , cex=0.8, pch=21, bg="palegreen4" , col="darkgray" ) 
}

q()


## what's going on with some of these calls?  clearly unlikely genotype calls

pseq eur rv-view  --mask reg=chr1:47133811 --phenotype phe  --gmeta 

## suggests: a) more stringent filtering (w/ deep coverage data, often works well)
##           b) tests based on likelihoods, expected dosages, instead of hard calls


##############################################################################################
#                                                                                            #
# Gene-based association                                                                     #
#                                                                                            #
##############################################################################################


# a naive application of gene-based test (SKAT) to CEU vs TSI 

pseq eur assoc --phenotype phe --mask loc.group=genes --locdb resources/locdb.genes  --mask mac=2-10  --out genebased

awk ' $6 < 1e-2 ' genebased.assoc

grep -w ZNF431 genebased.assoc.det

pseq eur v-view --gmeta --mask reg=chr19:21349311,chr19:21350330


## force all missing genotypes to reference/reference 

pseq eur assoc --phenotype phe --mask loc.group=genes assume-ref --locdb resources/locdb.genes  --mask mac=2-10  --out genebased2

awk ' $6 < 1e-2 ' genebased2.assoc

## generate gene-based QQ plot

R
d <- read.table("genebased2.assoc",header=T)
qqplot( d$P ) 


##############################################################################################
#                                                                                            #
# Using the Rplinkseq interface                                                              #
#                                                                                            #
##############################################################################################

R

library(Rplinkseq)

pseq.project( "eur" )

# obtain a 'list' of variant and genotype information
# note, fixed at 1000 variants being returned by default in Rplinkseq
# add limit=10000 to mask to increase, for example

k <- var.fetch( mask = "reg=chr22 file=GBR" ) 

length(k) 

str(k[[1]])

# get column from VCF

d <- x.core( k , "BP1" ) 

str(d) 


# extract variant meta-information

d2 <- x.consensus.meta( k , "DP" )  
str(d2) 

plot( d , d2 ) 

# genotype by individual matrix

d <- x.consensus.genotype( k ) 
str( d ) 

# histogram of allele frequencies

hist( apply( d , 1 , mean , na.rm = T ) / 2  )  

# get variant-meta information as a matrix with meta.fetch() -- more
# effecient than retrieving whole variant-lists for large samples)

d <- meta.fetch( c( "CHROM" , "POS" , "MQ" , "DP" ) , "file=GBR reg=chr7" ) 
head(d)


# get phenotypes

p <- ind.fetch( "phe" ) 
str(p)

# get both GBR and TSI

k <- var.fetch( mask = "reg=chr22 obs.file.req=GBR,TSI" ) 
d <- x.consensus.genotype( k ) 
t.test( d[ 1 , ] ~ p$phe ) 

# access the genotypes in this list

table( x.consensus.genotype( k[1] ) , useNA = "ifany" ) 


# define, then apply a simple function

f <- function(v) 
{ 
 cat("variant",v$CHR,v$BP1,v$CON$REF,"/",v$CON$ALT,"\n") 
} 

var.iterate( f , "limit=100" )

# second example: to calculate Ts/Tv

ti <<- tv <<- 0 

f_titv <- function( v ) {  
 ifelse( sum( sort(c(v$CON$ALT,v$CON$REF)) == c("A","G") ) == 2 | 
         sum( sort(c(v$CON$ALT,v$CON$REF)) == c("C","T") ) == 2 , 
         ti <<- ti+1 ,  tv <<- tv+1 )  
}  

# for speed, just look at first 10K variants
var.iterate( f_titv , "limit=10000" )  

ti / tv

# a bit awkward and slower...    but very flexible... 


##############################################################################################
#                                                                                            #
# Geneset association tests                                                                  #
#                                                                                            #
##############################################################################################

# BURDEN tests; condition just on 'singletons'

# this generates 1000 null datasets by permutation and stores all "BURDEN" 
# statistics (of a greater burden of rare mutations in GBR vs TSI, in this case)

pseq eur assoc --phenotype phe --tests burden --perm 1000 --dump-null-matrix \
               --mask loc.group=genes assume-ref mac=1-1 --locdb resources/locdb.genes  --out burden

## here's one I made earlier:
cp ~pshaun/2013/extra/burden.matrix .
cp ~pshaun/2013/extra/burden.assoc .
cp ~pshaun/2013/extra/burden.assoc.det .

less -S burden.matrix

# grab the necessary auxiliary files for the set-test

cp ~pshaun/2013/extra/gene.list
cp ~pshaun/2013/extrat/equiv.txt

# two generic 'gene-sets' 

cp ~pshaun/2013/extra/gene-families.set 
cp ~pshaun/2013/extra/tissues.set 


# run the set-based tests:

smp -m gene.list equiv.txt tissues.set burden.matrix > set-tissues.txt

grep ^SET set-tissues.txt

# any 'enriched' sets
awk ' $1 == "SET" && $4 < 0.01 ' set-tissues.txt

# what are the genes?
awk ' $NF == "mhc_class2_sw" ' set-tissues.txt


smp -m gene.list equiv.txt gene-families.set burden.matrix > set-families.txt

awk ' $1 == "SET" && $4 < 0.001 ' set-families.txt


##
## Aside: adaptive permutation with i-stats

# (gives up early on non-significant genes)

pseq eur assoc --phenotype phe --tests burden \
               --mask loc.group=genes assume-ref mac=2-10 --locdb resources/locdb.genes --info --out b2

R 

d <- read.table( "b2.assoc" , header=T)
hist( d$P , breaks = 50 ) 
hist( d$I , breaks=100 ) 
hist( d$P[ d$I < 0.1 ]  , breaks = 50 , xlim=c(0,1) )