from hail import *
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn
from pprint import pprint
hc = HailContext()
vds = hc.read('hail-practical/1kg.vds')
vds = vds.annotate_samples_table('hail-practical/masked_ancestry.txt', 'Sample',
code='sa.ancestry = table.Ancestry',
config=TextTableConfig(impute=True))
vds.query_samples('samples.map(s => sa.ancestry).counter()')
pca = (vds.filter_variants_intervals(IntervalTree.read('hail-practical/purcell5k.interval_list'))
.pca(scores='sa.pca'))
pca_table = pca.samples_keytable().to_pandas()
# Feel free to change this if your favorite color isn't present!
colors = {'AFR': 'green', 'AMR': 'yellow', 'EAS': 'orange', 'EUR': 'blue', 'SAS': 'cyan', "UNKWN": 'black'}
plt.scatter(pca_table["sa.pca.PC1"], pca_table["sa.pca.PC2"], c = pca_table["sa.ancestry"].map(colors), alpha = .5)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("PC2 vs PC1")
legend_entries = [mpatches.Patch(color= c, label=pheno) for pheno, c in colors.items()]
plt.legend(handles=legend_entries, loc=2)
plt.show()
plt.scatter(pca_table["sa.pca.PC1"], pca_table["sa.pca.PC3"], c = pca_table["sa.ancestry"].map(colors), alpha = .5)
plt.xlabel("PC1")
plt.ylabel("PC3")
plt.title("PC3 vs PC1")
legend_entries = [mpatches.Patch(color= c, label=pheno) for pheno, c in colors.items()]
plt.legend(handles=legend_entries, loc=2)
plt.show()
plt.scatter(pca_table["sa.pca.PC2"], pca_table["sa.pca.PC4"], c = pca_table["sa.ancestry"].map(colors), alpha = .5)
plt.xlabel("PC2")
plt.ylabel("PC4")
plt.title("PC4 vs PC2")
legend_entries = [mpatches.Patch(color= c, label=pheno) for pheno, c in colors.items()]
plt.legend(handles=legend_entries, loc=2)
plt.show()
<?>
with some more conditionals (else if ...
). If you add several else if
branches, you can do pretty well!¶test = pca.annotate_samples_expr('''sa.predicted_ancestry = if (sa.ancestry != "UNKWN") sa.ancestry
else if (sa.pca.PC1 > 0.10) "AFR"
<?>
else "UNKWN"
''')
test2 = test.annotate_samples_table('unmasked_ancestry.txt', 'Sample', code='sa.true_ancestry = table.Ancestry')
c = test2.query_samples('''samples.filter(s => sa.predicted_ancestry != sa.true_ancestry)
.map(s => {label: sa.predicted_ancestry, truth: sa.true_ancestry})
.counter()''')
pprint(c)
print('YOUR SCORE: %d' % sum(c.values()))