import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from math import log, isnan
from pprint import pprint
import matplotlib.patches as mpatches
from hail import *
%matplotlib inline
def qqplot(pvals):
spvals = sorted([x for x in pvals if x and not(isnan(x))])
exp = [-log(float(i) / len(spvals), 10) for i in np.arange(1, len(spvals) + 1, 1)]
obs = [-log(p, 10) for p in spvals]
plt.scatter(exp, obs)
plt.plot(np.arange(0, max(max(exp), max(obs))), c="red")
plt.xlabel("Expected p-value (-log10 scale)")
plt.ylabel("Observed p-value (-log10 scale)")
plt.xlim(xmin=0)
plt.ylim(ymin=0)
hc = HailContext()
vds = hc.read('1kg.vds')
vds = vds.annotate_samples_table('masked_ancestry.txt', 'Sample',
code='sa.ancestry = table.Ancestry',
config=TextTableConfig(impute=True))
vds.query_samples('samples.map(s => sa.ancestry).counter()')
pca = (vds.filter_variants_intervals(IntervalTree.read('purcell5k.interval_list'))
.pca(scores='sa.pca'))
pca_table = pca.samples_keytable().to_pandas()
# Feel free to change this if your favorite color isn't present!
colors = {'AFR': 'green', 'AMR': 'yellow', 'EAS': 'orange', 'EUR': 'blue', 'SAS': 'cyan', "UNKWN": 'black'}
plt.scatter(pca_table["sa.pca.PC1"], pca_table["sa.pca.PC2"], c = pca_table["sa.ancestry"].map(colors), alpha = .5)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("PC2 vs PC1")
legend_entries = [mpatches.Patch(color= c, label=pheno) for pheno, c in colors.items()]
plt.legend(handles=legend_entries, loc=2)
plt.show()
plt.scatter(pca_table["sa.pca.PC1"], pca_table["sa.pca.PC3"], c = pca_table["sa.ancestry"].map(colors), alpha = .5)
plt.xlabel("PC1")
plt.ylabel("PC3")
plt.title("PC3 vs PC1")
legend_entries = [mpatches.Patch(color= c, label=pheno) for pheno, c in colors.items()]
plt.legend(handles=legend_entries, loc=2)
plt.show()
plt.scatter(pca_table["sa.pca.PC2"], pca_table["sa.pca.PC4"], c = pca_table["sa.ancestry"].map(colors), alpha = .5)
plt.xlabel("PC2")
plt.ylabel("PC4")
plt.title("PC4 vs PC2")
legend_entries = [mpatches.Patch(color= c, label=pheno) for pheno, c in colors.items()]
plt.legend(handles=legend_entries, loc=2)
plt.show()
<?>
with some more conditionals (else if ...
). If you add several else if
branches, you can do pretty well!¶test = pca.annotate_samples_expr('''sa.predicted_ancestry = if (sa.ancestry != "UNKWN") sa.ancestry
else if (sa.pca.PC1 > 0.10) "AFR"
<?>
else "UNKWN"
''')
test2 = test.annotate_samples_table('unmasked_ancestry.txt', 'Sample', code='sa.true_ancestry = table.Ancestry')
c = test2.query_samples('''samples.filter(s => sa.predicted_ancestry != sa.true_ancestry)
.map(s => {label: sa.predicted_ancestry, truth: sa.true_ancestry})
.counter()''')
pprint(c)
print('YOUR SCORE: %d' % sum(c.values()))