In [ ]:
from hail import *
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn
from pprint import pprint
In [ ]:
hc = HailContext()
In [ ]:
vds = hc.read('hail-practical/1kg.vds')

Add in the ancestry annotations

In [ ]:
vds = vds.annotate_samples_table('hail-practical/masked_ancestry.txt', 'Sample', 
                                 code='sa.ancestry = table.Ancestry', 
                                 config=TextTableConfig(impute=True))
vds.query_samples('samples.map(s => sa.ancestry).counter()')

Notice that about 10% of the samples have "UNKWN" ancestry.

We are going to assign ancestry from their genomes with Hail!

The first step is Principal Component Analysis (PCA). Filter down to LD-independent sites before running PCA on all samples:

In [ ]:
pca = (vds.filter_variants_intervals(IntervalTree.read('hail-practical/purcell5k.interval_list'))
          .pca(scores='sa.pca'))

Generate PC plots!

In [ ]:
pca_table = pca.samples_keytable().to_pandas()
# Feel free to change this if your favorite color isn't present!
colors = {'AFR': 'green', 'AMR': 'yellow', 'EAS': 'orange', 'EUR': 'blue', 'SAS': 'cyan', "UNKWN": 'black'}
In [ ]:
plt.scatter(pca_table["sa.pca.PC1"], pca_table["sa.pca.PC2"], c = pca_table["sa.ancestry"].map(colors), alpha = .5)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("PC2 vs PC1")
legend_entries = [mpatches.Patch(color= c, label=pheno) for pheno, c in colors.items()]
plt.legend(handles=legend_entries, loc=2)
plt.show()
In [ ]:
plt.scatter(pca_table["sa.pca.PC1"], pca_table["sa.pca.PC3"], c = pca_table["sa.ancestry"].map(colors), alpha = .5)
plt.xlabel("PC1")
plt.ylabel("PC3")
plt.title("PC3 vs PC1")
legend_entries = [mpatches.Patch(color= c, label=pheno) for pheno, c in colors.items()]
plt.legend(handles=legend_entries, loc=2)
plt.show()
In [ ]:
plt.scatter(pca_table["sa.pca.PC2"], pca_table["sa.pca.PC4"], c = pca_table["sa.ancestry"].map(colors), alpha = .5)
plt.xlabel("PC2")
plt.ylabel("PC4")
plt.title("PC4 vs PC2")
legend_entries = [mpatches.Patch(color= c, label=pheno) for pheno, c in colors.items()]
plt.legend(handles=legend_entries, loc=2)
plt.show()

We can use the expression language to fill in unknown ancestry based on the plots. We filled in the first conditional, using PC1 to identify African ancestry. See how many samples you can identify!

Below, fill in the <?> with some more conditionals (else if ...). If you add several else if branches, you can do pretty well!

In [ ]:
test = pca.annotate_samples_expr('''sa.predicted_ancestry = if (sa.ancestry != "UNKWN") sa.ancestry
                           else if (sa.pca.PC1 > 0.10) "AFR"
                           <?>
                           else "UNKWN"
                          ''')

Run the code below to calculate the differences between predicted_ancestry and true unmasked ancestry

In [ ]:
test2 = test.annotate_samples_table('unmasked_ancestry.txt', 'Sample', code='sa.true_ancestry = table.Ancestry')
c = test2.query_samples('''samples.filter(s => sa.predicted_ancestry != sa.true_ancestry)
                                  .map(s => {label: sa.predicted_ancestry, truth: sa.true_ancestry})
                                  .counter()''')
pprint(c)
print('YOUR SCORE: %d' % sum(c.values()))

Which ancestry group(s) are the hardest to unmask? Why?

In [ ]: