import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from math import log, isnan
from pprint import pprint
import matplotlib.patches as mpatches
from hail import *
%matplotlib inline
def qqplot(pvals):
    """Draw a quantile-quantile plot of observed vs. expected -log10 p-values.

    Parameters
    ----------
    pvals : iterable of float or None
        P-values. None, NaN, and zero entries are dropped (log10 of 0 is
        undefined; falsy entries are treated as missing).

    Side effects: draws on the current matplotlib axes (rendered inline via
    %matplotlib inline); does not call plt.show().
    """
    # Keep only usable p-values; `x and ...` drops None and 0 before log10.
    spvals = sorted(x for x in pvals if x and not isnan(x))
    n = len(spvals)
    # Expected order statistics under the uniform null: i/n for i = 1..n.
    exp = [-log(float(i) / n, 10) for i in range(1, n + 1)]
    obs = [-log(p, 10) for p in spvals]
    plt.scatter(exp, obs)
    # Explicit y = x reference line. The original `plt.plot(np.arange(0, m))`
    # only produced this line by accident (values plotted against their own
    # indices); draw the diagonal deliberately.
    lim = max(max(exp), max(obs))
    plt.plot([0, lim], [0, lim], c="red")
    plt.xlabel("Expected p-value (-log10 scale)")
    plt.ylabel("Observed p-value (-log10 scale)")
    plt.xlim(xmin=0)
    plt.ylim(ymin=0)
# Entry point for the (pre-0.2) Hail API.
hc = HailContext()

# dataset already imported to VDS in hail-practical
# hc.import_vcf('1kg.vcf.bgz').filter_multi().variant_qc().sample_qc().write('1kg.vds', overwrite=True)
vds = hc.read('1kg.vds')

# variant_qc() only *annotates* per-variant QC metrics under va.qc --
# it doesn't actually filter variants from our dataset.
vds = vds.variant_qc()
vds = vds.sample_qc()
vds.persist()  # We'll use these results several times, so checkpoint our computations
print(vds.variant_schema)
# Use matplotlib to look at some of these values. Both sa.qc.callRate and
# sa.qc.dpMean may be useful metrics for filtering samples.
[call_rate, dp_mean] = vds.query_samples(['samples.map(s => sa.qc.callRate).collect()',
                                          'samples.map(s => sa.qc.dpMean).collect()'])
# NOTE(review): this list was previously named gq_mean, but the query collects
# sa.qc.dpMean and the axis is labelled "Mean DP" -- renamed to dp_mean.
plt.scatter(dp_mean, call_rate, alpha=0.25)
plt.xlabel('Mean DP')
plt.ylabel('Call Rate')
plt.title('Sample call rate vs mean DP')
plt.show()
# Same plot at the variant level: va.qc.callRate vs va.qc.dpMean.
# NOTE(review): renamed gq_mean -> dp_mean; the query collects dpMean.
[call_rate, dp_mean] = vds.query_variants(['variants.map(v => va.qc.callRate).collect()',
                                           'variants.map(v => va.qc.dpMean).collect()'])
plt.scatter(dp_mean, call_rate, alpha=0.2)
plt.xlabel('Mean DP')
plt.ylabel('Call Rate')
plt.title('Variant call rate vs mean DP')
plt.show()
# Compare dataset size before and after call-rate filtering to see how much
# the 0.95 thresholds remove.
original_counts = vds.count()
filtered_vds = (vds
                .filter_variants_expr('va.qc.callRate > 0.95')
                .filter_samples_expr('sa.qc.callRate > 0.95'))
new_counts = filtered_vds.count()
print(original_counts)
print(new_counts)
# The %%sh magic runs the contents of the cell as Unix commands.
%%sh
head sample_annotations.txt | column -t
%%sh
head variant_annotations.txt | column -t
# Join per-sample metadata from a text table (keyed on its 'Sample' column)
# into the dataset under sa.metadata. impute=True asks Hail to infer each
# column's type from the file contents.
filtered_vds = filtered_vds.annotate_samples_table('sample_annotations.txt',
                                                   sample_expr='Sample',
                                                   root='sa.metadata',
                                                   config=TextTableConfig(impute=True))
# Join per-variant annotations; `code=` copies only the Consequence column
# into va.consequence rather than rooting the whole table row.
filtered_vds = filtered_vds.annotate_variants_table('variant_annotations.txt',
                                                    variant_expr='Variant',
                                                    code='va.consequence = table.Consequence',
                                                    config=TextTableConfig(impute=True))
# Show both schemas after annotation, then tabulate the joined fields.
for label, schema in [('Variant schema:', filtered_vds.variant_schema),
                      ('Sample schema:', filtered_vds.sample_schema)]:
    print(label)
    print(schema)
filtered_vds.query_variants('variants.map(v => va.consequence).counter()')
filtered_vds.query_samples('samples.map(s => sa.metadata.SuperPopulation).counter()')
# Not all base pairs are created equal. A "CG" motif (see Wikipedia) has a
# high probability of mutating to "TG"; on the opposite strand, a "GC" motif
# is likely to mutate to "AC". We therefore expect to see many C/T and G/A
# SNPs in our dataset. Do we?
# Tally alternate alleles across the whole dataset and list the most common.
allele_counts = vds.query_variants('variants.map(v => v.altAllele).counter()')
from collections import Counter
pprint(Counter(allele_counts).most_common())
# Expected output (alt-allele counts) -- transitions dominate, as predicted:
#   G/A | 15689986
#   C/T | 15685250
#   A/G | 10779869
#   T/C | 10736805
#   G/T | 3735538
#   C/A | 3727525
#   C/G | 3436122
#   G/C | 3425025
#   T/G | 2861302
#   A/C | 2860428
#   A/T | 2715606
#   T/A | 2713458
# Histogram of alternate-allele frequencies; collect() pulls every AF value
# back to the driver before plotting.
af = vds.query_variants('variants.map(v => va.qc.AF).collect()')
plt.hist(af, bins=250)
plt.xlabel('Alternate allele frequency')
plt.ylabel('Count')
plt.show()
# This will always work! hist() aggregates remotely, so only the 250 bin
# counts come back to the driver instead of every AF value.
af_hist = vds.query_variants('variants.map(v => va.qc.AF).hist(0, 1, 250)')
left_edges = af_hist.binEdges[:-1]
plt.bar(left_edges, af_hist.binFrequencies, width=0.004)
plt.xlim(0, 1)
plt.xlabel('Alternate allele frequency')
plt.ylabel('Count')
plt.show()
# We'll use filtered_vds for the next practical.