from hail import *
from hail.representation import *
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn
from pprint import pprint
hc = HailContext()
vds = hc.read('hail-practical/1kg.vds').variant_qc().persist()
af = vds.query_variants('variants.map(v => va.qc.AF).collect()')
plt.hist(af, bins=100)
plt.show()
# This will always work!
af_hist = vds.query_variants('variants.map(v => va.qc.AF).hist(0, 1, 100)')
plt.bar(af_hist.binEdges[:-1], af_hist.binFrequencies, width=0.01)
plt.xlim(0, 1)
plt.show()
Not all base pairs are created equal. A "CG" motif (see wikipedia) has a high probability of mutating to "TG".
On the opposite strand, a "GC" motif is likely to mutate to "AC".
We expect to see many C/T and G/A SNPs in our dataset. Do we?
alleles = vds.query_variants('variants.map(v => v.altAllele).counter()')
from collections import Counter
pprint(Counter(alleles).most_common())
G/A | 15689986
C/T | 15685250
A/G | 10779869
T/C | 10736805
G/T | 3735538
C/A | 3727525
C/G | 3436122
G/C | 3425025
T/G | 2861302
A/C | 2860428
A/T | 2715606
T/A | 2713458
contigs = vds.query_variants('variants.map(v => v.contig).counter()')
pprint(Counter(contigs).most_common())
GQ is "genotype quality", which is roughly the log-scaled probability that your genotype is called wrong.
GQ 10 means 90% confidence.
GQ 20 means 99% confidence.
GQ 30 means 99.9% confidence.
GQ is truncated at 99, which corresponds to a ~ 0.0000000001 chance that your call is wrong (if it's calibrated!) | If we plot a histogram of GQ values, what will it look like?
DP is "depth", which is the total number of reads for a given sample at a given variant.
[gq_hist, dp_hist] = vds.query_genotypes(['gs.map(g => g.gq).hist(0, 100, 100)',
'gs.map(g => g.dp).hist(0, 30, 30)'])
plt.xlim(0, 100)
plt.ylim(0, 2500000)
plt.xlabel('GQ')
plt.ylabel('Count')
plt.title('GQ Histogram')
plt.bar(gq_hist.binEdges[:-1], gq_hist.binFrequencies, width=1, label='GQ')
plt.legend()
plt.show()
plt.xlim(0, 30)
plt.ylim(0, 3500000)
plt.xlabel('DP')
plt.ylabel('Count')
plt.title('DP Histogram')
plt.bar(dp_hist.binEdges[:-1], dp_hist.binFrequencies, width=1, label='DP')
plt.legend()
plt.show()
[het_gq_hist, hom_gq_hist] = vds.query_genotypes(['gs.filter(g => g.isHet).map(g => g.gq).hist(0, 100, 100)',
'gs.filter(g => !g.isHet).map(g => g.gq).hist(0, 100, 100)'])
plt.xlim(0, 100)
plt.ylim(0, 2000000)
plt.xlabel('GQ')
plt.ylabel('Count')
plt.title('GQ Histogram')
plt.bar(het_gq_hist.binEdges[:-1], het_gq_hist.binFrequencies, width=1, color='red', label='heterozygotes')
plt.legend()
plt.show()
plt.xlim(0, 100)
plt.ylim(0, 2500000)
plt.xlabel('GQ')
plt.ylabel('Count')
plt.title('GQ Histogram')
plt.bar(hom_gq_hist.binEdges[:-1], hom_gq_hist.binFrequencies, width=1, label='homozygotes')
plt.legend()
plt.show()
# argmax is the index of the largest value in a list
from numpy import argmax
gq_mode = int(hom_gq_hist.binEdges[argmax(hom_gq_hist.binFrequencies)])
dp_mode = int(dp_hist.binEdges[argmax(dp_hist.binFrequencies)])
print('GQ mode is %d' % gq_mode)
print('DP mode is %d' % dp_mode)
gq_dp_hist = vds.query_genotypes('gs.filter(g => !g.isHet).map(g => g.gq / g.dp).hist(1, 7, 12)')
plt.xlim(0, 8)
plt.ylim(0, 20000000)
plt.xlabel('GQ/DP')
plt.ylabel('Frequency')
plt.title('GQ / DP Histogram')
plt.bar(gq_dp_hist.binEdges[:-1], gq_dp_hist.binFrequencies, width=0.5, label='homozygote GQ/DP')
plt.legend()
plt.show()
from scipy.stats import binom
from numpy import log10
def binomial_p(num_reads):
return binom(num_reads, 0.5).pmf(0)
# phred scaling is -10 * log10(x)
def phred_scale(x):
return -10 * log10(x)
xs = xrange(30)
ys = [phred_scale(binomial_p(x)) for x in xs]
observed_ratio = [3 * x for x in xs]
plt.scatter(xs, ys, label='binomial probability of 0 reads')
plt.plot(xs, map(lambda value: 3 * value, xs), color='k', label='slope=3')
plt.xlim(0, 30)
plt.ylim(0, 100)
plt.xlabel('Reads')
plt.ylabel('phred-scaled GQ')
plt.legend()
plt.show()
true_het_gq = vds.query_genotypes('gs.filter(g => g.isHet).map(g => min(g.pl[0], g.pl[2])).hist(0, 400, 100)')
plt.xlim(0, 400)
plt.ylim(0, 500000)
plt.xlabel('True GQ')
plt.ylabel('Count')
plt.title('True GQ Histogram')
plt.bar(true_het_gq.binEdges[:-1], true_het_gq.binFrequencies, width=4, color='red', label='heterozygotes')
plt.legend()
plt.show()
Fill in the <?>
with code.
[mean_gq, mean_dp] = vds.query_genotypes(['<?>', '<?>'])
print("mean GQ is %f, mean DP is %f" % (mean_gq, mean_dp))
Fill in the <?>
with code.
[gq20, hom_gq20, het_gq20] = vds.query_genotypes(['<?>', '<?>', '<?>'])
print("Total fraction is %f, among homozygotes %f, and heterozygotes %f" % (gq20, hom_gq20, het_gq20))