###########################################
# Data
###########################################

## VCF exome data from 1000 Genomes

# GBR British in England and Scotland
# TSI Toscani in Italia
# LWK Luhya in Webuye, Kenya
# ASW Americans of African Ancestry in SW USA 

###########################################
# Section 1: Setting up and viewing data
###########################################

## make PSEQpractical directory
mkdir PSEQpractical


## change directory
cd PSEQpractical


## copy directory contents
cp /faculty/douglas/PSEQpractical/* ./


## symbolic link to resources
ln -s /opt/pseq/resources resources


## symbolic link to vcf
ln -s /opt/pseq/vcf vcf


## Make sure pseq runs
pseq


## test help menus
pseq help
pseq help views


## view VCF (type q to exit less)
zless -S vcf/GBR.vcf.gz
pseq vcf/GBR.vcf.gz meta-summary | less -S


## view VCF using pseq
pseq vcf/GBR.vcf.gz v-view | less -S


## view resource databases
## LOCDB: RefSeq gene transcripts (e.g. from UCSC, or HGNC website)
pseq . loc-summary --locdb resources/locdb


## SEQDB: this has been pre-made (from a FASTA downloaded from UCSC)
pseq . seq-summary --seqdb resources/seqdb 


## to view a stretch of reference sequence:
pseq . seq-view  --seqdb resources/seqdb  --region chr22:24000000..24000010 


## create pseq project
pseq lwk new-project --resources resources/


## load VCF (will take about a minute to load)
pseq lwk load-vcf --vcf vcf/LWK.vcf.gz


## look at pseq file using less
less -S lwk.pseq 


## viewing variant data (v-view, rv-view, mrv-view)
# view all variants
pseq lwk v-view | less -S


## view all non-reference variants with genotypes
pseq lwk rv-view | less -

## add varianat level meta information
pseq lwk rv-view --vmeta | less
pseq lwk rv-view --vmeta --show DP | less


## add genotype level meta information
pseq lwk rv-view --gmeta | less
pseq lwk rv-view --gmeta --show GQ | less


## individual summary
pseq lwk i-stats


## variant summary
pseq lwk v-stats


###########################################
# Section 2: Filtering and QC-ing data
###########################################

## Masks

## Get summary statistics for genomic regions
# i-stats
pseq lwk i-stats --mask reg=chr22


## v-stats
pseq lwk v-stats --mask reg=chr22:10000000-20000000
pseq lwk v-stats --mask reg=chr22 --stats gmean=GQ 
pseq lwk v-stats --mask reg=chr22  --stats count=DP:0-100,DP:200-1000,DP:1000-5000,DP:5000- mac=1,2-4,5-10,11-


## filtering in individuals
pseq lwk mrv-view --mask indiv=NA19350 reg=chr22:10000000-20000000


## filter out individuals
pseq lwk v-stats --mask reg=chr17 indiv.ex=NA19350


## filter out file of individuals
pseq lwk i-view | grep -v \# | awk '{print $1}' | head -n 5 > indiv.remove
pseq lwk v-stats --mask reg=chr17 indiv.ex=@indiv.remove


## filtering out any failing variants
pseq lwk v-stats --mask reg=chr22 any.filter.ex


##show variant level meta information per variant in matrix form
pseq lwk meta-matrix  --show DP MQ | less


## filtering genotypes
pseq lwk v-stats --mask reg=chr22 geno=GQ:ge:30


## make a new VCF of all variants with at least one alternate allele assuming missing calls are reference (filtered, here only chr22)
pseq lwk write-vcf --mask reg=chr22 assume-ref mac=1- > c22.vcf


pseq c22 new-project --resources resources/
pseq c22 load-vcf --vcf c22.vcf 


## gene-based summaries
pseq c22 g-view --mask loc.group=refseq | less
pseq c22 g-view --geno --mask gene=RAC2 


# gene-based summaries
# note, this will include intronic variants also
# note: 'empty.group' mask, to force output for genes w/ no variants

pseq lwk g-stats --mask loc.group=genes empty.group --locdb resources/locdb.genes | less


## Ancestry inference and relatedness
less resources/panel.pfrq
pseq lwk i-pop --file resources/panel.pfrq --out lwk


## Look at pairwise sharing (relatedness/contamination)
pseq lwk ibs-matrix --mask mac=2-5 --long-format --two-counts 


## variants unique to (or enriched in) a particular group
# for the two individuals showing excess sharing (from ibs-matrix command, above)

pseq lwk unique --indiv NA19334 NA19313


###########################################
# Section 3: Annotation
###########################################

## annotate all variants using refseq
pseq c22 lookup --annotate refseq  --mask no-geno no-vmeta > annot.txt


## let's look at the annotation output
less annot.txt

grep chr22:17469049 annot.txt

grep chr22:39498038 annot.txt


## create file with nonsysonymous variants from the "worst" annotation category
awk ' $2 == "worst" { print $1,$3 } ' annot.txt  > worst.txt

awk '{print $2}' worst.txt | sort | uniq -c

awk ' $2 == "missense" || $2 == "nonsense" { print $1 } ' worst.txt > ns.txt
awk ' $2 == "intronic" { print $1 } ' worst.txt > intronic.txt


# filter only on particular annotations
pseq c22 v-stats --mask ereg=@ns.txt


############################################
# Section 4: Variant/Gene based association
############################################

## create chromosome 22 VCF files from two different datasets
pseq vcf/GBR.vcf.gz write-vcf --mask reg=chr22 assume-ref mac=1- > gbr-c22.vcf
pseq vcf/TSI.vcf.gz write-vcf --mask reg=chr22 assume-ref mac=1- > tsi-c22.vcf


## Create a combined pseq project
pseq eur new-project --resources resources/
pseq eur load-vcf --vcf gbr-c22.vcf --vcf tsi-c22.vcf


## Look at dummy phenotype file (GBR as cases, TSI as controls)
less pop.phe
awk '{print $2}' pop.phe  | sort | uniq -c


## load phenotype file
pseq eur load-pheno --file pop.phe


## view individuals to see phenotypes have been loaded
pseq eur i-view | head


## get allele counts per variant by phenotype
pseq eur counts --pheno phe --mask limit=10


## single-site associaton (fisher's exact test)
pseq eur v-assoc --pheno phe > assoc.txt


## View single variant association
less -S assoc.txt


## gene-based association

pseq eur assoc --pheno phe --mask loc.group=refseq --out genebased
awk ' $6 < 0.01 ' genebased.assoc


grep -w NM_031890 genebased.assoc.det
pseq eur rv-view --mask reg=chr22:17600497 --pheno phe --gmeta


## annotate eur chr22
pseq eur lookup --annotate refseq  --mask no-geno no-vmeta > annot2.txt


## pull out nonsynonymous variants
awk '$2 == "worst" && ($3 ~ "nonsense" || $3 ~ "missense") {print $1}'  annot2.txt > ns2.txt


## burden test, using permutation, restricted to NS variants
pseq eur assoc --pheno phe --mask loc.group=genes --locdb resources/locdb.genes --mask ereg.req=@ns2.txt --tests burden vt  --out genebased2
awk ' $6 < 0.01 ' genebased2.assoc

 
grep SHANK3 genebased2.assoc.det 
pseq eur rv-view --gmeta --mask ereg.req=@ns2.txt gene=SHANK3 


############################################
# Section 5: Gene set tests
############################################

# BURDEN tests; condition just on 'singletons'

# this generates 1000 null datasets by permutation and stores all "BURDEN" 
# statistics (of a greater burden of rare mutations in GBR vs TSI, in this case)

pseq eur assoc --phenotype phe --tests burden --perm 1000 --dump-null-matrix \
               --mask loc.group=genes assume-ref mac=1-1 ereg.req=@ns.txt --locdb resources/locdb.genes  --out burden-ns


less -S burden.matrix

# necessary auxiliary files for the set-test
less gene.list
less equiv.txt


## generic 'gene-set' 
less gene-families.set 

## make alias
alias smp="./smp"


## run the set-based tests:

smp -m gene.list equiv.txt gene-families.set burden.matrix > set-gene-families.txt


# any 'enriched' sets
awk ' $1 == "SET" && $4 < 0.01 ' set-gene-families.txt


#################################################
# Additional topics (Just examples, not tested!)
#################################################

##Creating varsets

## annotate all variants to be gene/allele specific (will take a long time to do whole genome)
pseq eur lookup --annotate refseq  --mask no-geno no-vmeta --worstByAlias symbol > annot.txt

* create varsets

awk ' $2 == "worst_by_alias_symbol_alt" { printf $1"\t"$3 } \
      $2 == "worst_by_alias_symbol"     { printf   "\t"$3 } \
      $2 == "alias_symbol"              { printf   "\t"$3"\n" } ' annot.txt | \
    tr ',' '\t' | tr ' ' '\t' | awk -F"\t" ' { n=(NF-1)/3 ; for(i=0;i<n;i++) printf "VAR\t"$1"\t"$(2+2*n+i)"|"$(2+n+i)"\t"$(2+i)"\n" } ' > annot2genes

** disruptive

egrep -w "(nonsense|esplice|frameshift)" annot2genes | tr '|' '\t' | awk ' { print $1,$3"|disruptive",$2,$5 } ' OFS="\t" > disruptive.varset
awk ' { print "disruptive" , $2 } ' OFS="\t" disruptive.varset | sort | uniq > disruptive.varsuperset

pseq c22 var-set --file ${FILES}/disruptive.varset
pseq c22 var-superset --file ${FILES}/disruptive.varsuperset


## checks
pseq $PROJ var-summary | less

## just pick first 10 people to remove
pseq eur i-view | grep -v \# | awk '{print $1}' | head > indiv.remove


## can now use varsets to filter or perform more advanced operation like genic association
pseq eur assoc --phenotype phe \
       --mask varset.group=disruptive \
       --mask mac=1-5 indiv.ex=@indiv.remove \
       --mask any.filter.ex geno=GQ:ge:90,DP:ge:10 \
       --out disruptive-mac1-5