import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from math import log, isnan
from pprint import pprint
import matplotlib.patches as mpatches
from hail import *
%matplotlib inline
def qqplot(pvals):
spvals = sorted([x for x in pvals if x and not(isnan(x))])
exp = [-log(float(i) / len(spvals), 10) for i in np.arange(1, len(spvals) + 1, 1)]
obs = [-log(p, 10) for p in spvals]
plt.scatter(exp, obs)
plt.plot(np.arange(0, max(max(exp), max(obs))), c="red")
plt.xlabel("Expected p-value (-log10 scale)")
plt.ylabel("Observed p-value (-log10 scale)")
plt.xlim(xmin=0)
plt.ylim(ymin=0)
hc = HailContext()
vds = hc.read('1kg.vds').cache()
GQ is "genotype quality", which is roughly the log-scaled probability that your genotype is called wrong.
GQ 10 means 90% confidence.
GQ 20 means 99% confidence.
GQ 30 means 99.9% confidence.
GQ is truncated at 99, which corresponds to a ~ 0.0000000001 chance that your call is wrong (if it's calibrated!) | If we plot a histogram of GQ values, what will it look like?
DP is "depth", which is the total number of reads for a given sample at a given variant.
hist
functions again here. You definitely can't collect all the GQ values for a full dataset: with 1K Genomes, you'd get 200 billion values back (and you probably don't have 1600 gigabytes of RAM)¶[gq_hist, dp_hist] = vds.query_genotypes(['gs.map(g => g.gq).hist(0, 100, 100)',
'gs.map(g => g.dp).hist(0, 30, 30)'])
plt.xlim(0, 100)
plt.ylim(0, 2500000)
plt.xlabel('GQ')
plt.ylabel('Count')
plt.title('GQ Histogram')
plt.bar(gq_hist.binEdges[:-1], gq_hist.binFrequencies, width=1, label='GQ')
plt.legend()
plt.show()
plt.xlim(0, 30)
plt.ylim(0, 3500000)
plt.xlabel('DP')
plt.ylabel('Count')
plt.title('DP Histogram')
plt.bar(dp_hist.binEdges[:-1], dp_hist.binFrequencies, width=1, label='DP')
plt.legend()
plt.show()
[het_gq_hist, hom_gq_hist] = vds.query_genotypes(['gs.filter(g => g.isHet).map(g => g.gq).hist(0, 100, 100)',
'gs.filter(g => !g.isHet).map(g => g.gq).hist(0, 100, 100)'])
plt.xlim(0, 100)
plt.ylim(0, 2000000)
plt.xlabel('GQ')
plt.ylabel('Count')
plt.title('GQ Histogram')
plt.bar(het_gq_hist.binEdges[:-1], het_gq_hist.binFrequencies, width=1, color='red', label='heterozygotes')
plt.legend()
plt.show()
plt.xlim(0, 100)
plt.ylim(0, 2500000)
plt.xlabel('GQ')
plt.ylabel('Count')
plt.title('GQ Histogram')
plt.bar(hom_gq_hist.binEdges[:-1], hom_gq_hist.binFrequencies, width=1, label='homozygotes')
plt.legend()
plt.show()
# argmax is the index of the largest value in a list
from numpy import argmax
gq_mode = int(hom_gq_hist.binEdges[argmax(hom_gq_hist.binFrequencies)])
dp_mode = int(dp_hist.binEdges[argmax(dp_hist.binFrequencies)])
print('GQ mode is %d' % gq_mode)
print('DP mode is %d' % dp_mode)
gq_dp_hist = vds.query_genotypes('gs.filter(g => !g.isHet).map(g => g.gq / g.dp).hist(1, 7, 12)')
plt.xlim(0, 8)
plt.ylim(0, 20000000)
plt.xlabel('GQ/DP')
plt.ylabel('Frequency')
plt.title('GQ / DP Histogram')
plt.bar(gq_dp_hist.binEdges[:-1], gq_dp_hist.binFrequencies, width=0.5, label='homozygote GQ/DP')
plt.legend()
plt.show()
from scipy.stats import binom
from numpy import log10
def binomial_p(num_reads):
return binom(num_reads, 0.5).pmf(0)
# phred scaling is -10 * log10(x)
def phred_scale(x):
return -10 * log10(x)
xs = xrange(30)
ys = [phred_scale(binomial_p(x)) for x in xs]
observed_ratio = [3 * x for x in xs]
plt.scatter(xs, ys, label='binomial probability of 0 reads')
plt.plot(xs, map(lambda value: 3 * value, xs), color='k', label='slope=3')
plt.xlim(0, 30)
plt.ylim(0, 100)
plt.xlabel('Reads')
plt.ylabel('phred-scaled GQ')
plt.legend()
plt.show()
- 10 * log10(0.5)
true_het_gq = vds.query_genotypes('gs.filter(g => g.isHet).map(g => min(g.pl[0], g.pl[2])).hist(0, 400, 100)')
plt.xlim(0, 400)
plt.ylim(0, 500000)
plt.xlabel('True GQ')
plt.ylabel('Count')
plt.title('True GQ Histogram')
plt.bar(true_het_gq.binEdges[:-1], true_het_gq.binFrequencies, width=4, color='red', label='heterozygotes')
plt.legend()
plt.show()
Fill in the <?>
with code.
[mean_gq, mean_dp] = vds.query_genotypes(['<?>', '<?>'])
print("mean GQ is %f, mean DP is %f" % (mean_gq, mean_dp))
Fill in the <?>
with code.
[gq20, hom_gq20, het_gq20] = vds.query_genotypes(['<?>', '<?>', '<?>'])
print("Total fraction is %f, among homozygotes %f, and heterozygotes %f" % (gq20, hom_gq20, het_gq20))