from hail import *
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn
from pprint import pprint

hc = HailContext()

vds = hc.read('hail-practical/1kg.vds')

Add in the ancestry annotations¶

vds = vds.annotate_samples_table('hail-practical/masked_ancestry.txt', 'Sample', 
                                 code='sa.ancestry = table.Ancestry', 
                                 config=TextTableConfig(impute=True))
vds.query_samples('samples.map(s => sa.ancestry).counter()')

Notice that about 10% of the samples have "UNKWN" ancestry.¶

We are going to assign ancestry from their genomes with Hail!¶

The first step is Principal Component Analysis (PCA). Filter down to LD-independent sites before running PCA on all samples:¶

pca = (vds.filter_variants_intervals(IntervalTree.read('hail-practical/purcell5k.interval_list'))
          .pca(scores='sa.pca'))

Generate PC plots!¶

pca_table = pca.samples_keytable().to_pandas()
# Feel free to change this if your favorite color isn't present!
colors = {'AFR': 'green', 'AMR': 'yellow', 'EAS': 'orange', 'EUR': 'blue', 'SAS': 'cyan', "UNKWN": 'black'}

plt.scatter(pca_table["sa.pca.PC1"], pca_table["sa.pca.PC2"], c = pca_table["sa.ancestry"].map(colors), alpha = .5)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("PC2 vs PC1")
legend_entries = [mpatches.Patch(color= c, label=pheno) for pheno, c in colors.items()]
plt.legend(handles=legend_entries, loc=2)
plt.show()

plt.scatter(pca_table["sa.pca.PC1"], pca_table["sa.pca.PC3"], c = pca_table["sa.ancestry"].map(colors), alpha = .5)
plt.xlabel("PC1")
plt.ylabel("PC3")
plt.title("PC3 vs PC1")
legend_entries = [mpatches.Patch(color= c, label=pheno) for pheno, c in colors.items()]
plt.legend(handles=legend_entries, loc=2)
plt.show()

plt.scatter(pca_table["sa.pca.PC2"], pca_table["sa.pca.PC4"], c = pca_table["sa.ancestry"].map(colors), alpha = .5)
plt.xlabel("PC2")
plt.ylabel("PC4")
plt.title("PC4 vs PC2")
legend_entries = [mpatches.Patch(color= c, label=pheno) for pheno, c in colors.items()]
plt.legend(handles=legend_entries, loc=2)
plt.show()

We can use the expression language to fill in unknown ancestry based on the plots. We filled in the first conditional, using PC1 to identify African ancestry. See how many samples you can identify!¶

Below, fill in the `<?>` with some more conditionals (`else if ...`). If you add several `else if` branches, you can do pretty well!¶

test = pca.annotate_samples_expr('''sa.predicted_ancestry = if (sa.ancestry != "UNKWN") sa.ancestry
                           else if (sa.pca.PC1 > 0.10) "AFR"
                           <?>
                           else "UNKWN"
                          ''')

Run the code below to calculate the differences between predicted_ancestry and true unmasked ancestry¶

test2 = test.annotate_samples_table('unmasked_ancestry.txt', 'Sample', code='sa.true_ancestry = table.Ancestry')
c = test2.query_samples('''samples.filter(s => sa.predicted_ancestry != sa.true_ancestry)
                                  .map(s => {label: sa.predicted_ancestry, truth: sa.true_ancestry})
                                  .counter()''')
pprint(c)
print('YOUR SCORE: %d' % sum(c.values()))

Which ancestry group(s) are the hardest to unmask? Why?¶

Add in the ancestry annotations¶

Notice that about 10% of the samples have "UNKWN" ancestry.¶

We are going to assign ancestry from their genomes with Hail!¶

The first step is Principal Component Analysis (PCA). Filter down to LD-independent sites before running PCA on all samples:¶

Generate PC plots!¶

We can use the expression language to fill in unknown ancestry based on the plots. We filled in the first conditional, using PC1 to identify African ancestry. See how many samples you can identify!¶

Below, fill in the <?> with some more conditionals (else if ...). If you add several else if branches, you can do pretty well!¶

Run the code below to calculate the differences between predicted_ancestry and true unmasked ancestry¶

Which ancestry group(s) are the hardest to unmask? Why?¶

Below, fill in the `<?>` with some more conditionals (`else if ...`). If you add several `else if` branches, you can do pretty well!¶