from hail import *
import matplotlib.pyplot as plt
import numpy as np
import seaborn
%matplotlib inline
hc = HailContext()
hc.import_vcf('hail-practical/1kg.vcf.bgz').filter_multi().write('hail-practical/1kg.vds', overwrite=True)
vds = hc.read('hail-practical/1kg.vds')
print(vds.variant_schema)
print(vds.sample_schema)
variant_qc
and sample_qc
, these don't actually filter variants or samples.¶vds = vds.variant_qc().sample_qc().persist()
print(vds.variant_schema)
print(vds.sample_schema)
matplotlib
to look at some of these values. Both sa.qc.callRate
and sa.qc.gqMean
may be useful metrics for filtering.¶[call_rate, gq_mean] = vds.query_samples(['samples.map(s => sa.qc.callRate).collect()',
'samples.map(s => sa.qc.dpMean).collect()'])
plt.scatter(gq_mean, call_rate, alpha=0.25)
plt.xlabel('Mean DP')
plt.ylabel('Call Rate')
plt.title('Sample call rate vs mean DP')
plt.show()
[call_rate, gq_mean] = vds.query_variants(['variants.map(v => va.qc.callRate).collect()',
'variants.map(v => va.qc.dpMean).collect()'])
plt.scatter(gq_mean, call_rate, alpha=0.2)
plt.xlabel('Mean DP')
plt.ylabel('Call Rate')
plt.title('Variant call rate vs mean DP')
plt.show()
original_counts = vds.count()
filtered_vds = (vds.filter_variants_expr('va.qc.callRate > 0.95')
.filter_samples_expr('sa.qc.callRate > 0.95'))
new_counts = filtered_vds.count()
print(original_counts)
print(new_counts)
%%sh
head hail-practical/sample_annotations.txt | column -t
filtered_vds = filtered_vds.annotate_samples_table('hail-practical/sample_annotations.txt',
sample_expr='Sample',
root='sa.metadata',
config=TextTableConfig(impute=True))
print(filtered_vds.sample_schema)
filtered_vds = filtered_vds.annotate_variants_table('hail-practical/variant_annotations.txt',
variant_expr='Variant',
code='va.consequence = table.Consequence',
config=TextTableConfig(impute=True))
print(filtered_vds.variant_schema)
filtered_vds.query_variants('variants.map(v => va.consequence).counter()')
filtered_vds.query_samples('samples.map(s => sa.metadata.SuperPopulation).counter()')
Fill in the <?>
with code.
filtered_vds.query_variants('<?>')
sa.metadata.Population == "GBR"
) have purple hair?¶Fill in the <?>
with code.
Hint 1: the fraction of samples that are British is samples.fraction(s => sa.metadata.Population == "GBR")
Hint 2: you need to use ".filter"
filtered_vds.query_samples('<?>')
sa.metadata.SuperPopulation == "SAS"
)?¶Fill in the <?>
with code.
Hint: Remember that .stats()
will compute a variety of useful metrics from a numeric Aggregable.
filtered_vds.query_samples('<?>')