{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from hail import *\n", "import matplotlib.pyplot as plt\n", "import matplotlib.patches as mpatches\n", "import seaborn\n", "from pprint import pprint" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "hc = HailContext()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "vds = hc.read('hail-practical/1kg.vds')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Add in the ancestry annotations" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "vds = vds.annotate_samples_table('hail-practical/masked_ancestry.txt', 'Sample', \n", " code='sa.ancestry = table.Ancestry', \n", " config=TextTableConfig(impute=True))\n", "vds.query_samples('samples.map(s => sa.ancestry).counter()')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Notice that about 10% of the samples have \"UNKWN\" ancestry. \n", "\n", "## We are going to assign ancestry from their genomes with Hail!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# The first step is Principal Component Analysis (PCA). Filter down to LD-independent sites before running PCA on all samples:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "pca = (vds.filter_variants_intervals(IntervalTree.read('hail-practical/purcell5k.interval_list'))\n", " .pca(scores='sa.pca'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Generate PC plots!" 
] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "pca_table = pca.samples_keytable().to_pandas()\n", "# Feel free to change this if your favorite color isn't present!\n", "colors = {'AFR': 'green', 'AMR': 'yellow', 'EAS': 'orange', 'EUR': 'blue', 'SAS': 'cyan', \"UNKWN\": 'black'}" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "plt.scatter(pca_table[\"sa.pca.PC1\"], pca_table[\"sa.pca.PC2\"], c = pca_table[\"sa.ancestry\"].map(colors), alpha = .5)\n", "plt.xlabel(\"PC1\")\n", "plt.ylabel(\"PC2\")\n", "plt.title(\"PC2 vs PC1\")\n", "legend_entries = [mpatches.Patch(color= c, label=pheno) for pheno, c in colors.items()]\n", "plt.legend(handles=legend_entries, loc=2)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "plt.scatter(pca_table[\"sa.pca.PC1\"], pca_table[\"sa.pca.PC3\"], c = pca_table[\"sa.ancestry\"].map(colors), alpha = .5)\n", "plt.xlabel(\"PC1\")\n", "plt.ylabel(\"PC3\")\n", "plt.title(\"PC3 vs PC1\")\n", "legend_entries = [mpatches.Patch(color= c, label=pheno) for pheno, c in colors.items()]\n", "plt.legend(handles=legend_entries, loc=2)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "plt.scatter(pca_table[\"sa.pca.PC2\"], pca_table[\"sa.pca.PC4\"], c = pca_table[\"sa.ancestry\"].map(colors), alpha = .5)\n", "plt.xlabel(\"PC2\")\n", "plt.ylabel(\"PC4\")\n", "plt.title(\"PC4 vs PC2\")\n", "legend_entries = [mpatches.Patch(color= c, label=pheno) for pheno, c in colors.items()]\n", "plt.legend(handles=legend_entries, loc=2)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## We can use the expression language to fill in unknown ancestry based on the plots. 
We filled in the first conditional, using PC1 to identify African ancestry. See how many samples you can identify!\n", "\n", "## Below, fill in the blank line with some more conditionals (`else if ...`). If you add several `else if` branches, you can do pretty well!" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "test = pca.annotate_samples_expr('''sa.predicted_ancestry = if (sa.ancestry != \"UNKWN\") sa.ancestry\n", "                                    else if (sa.pca.PC1 > 0.10) \"AFR\"\n", "                                    \n", "                                    else \"UNKWN\"\n", "                                    ''')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Run the code below to calculate the differences between predicted_ancestry and true unmasked ancestry" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "test2 = test.annotate_samples_table('hail-practical/unmasked_ancestry.txt', 'Sample', code='sa.true_ancestry = table.Ancestry')\n", "c = test2.query_samples('''samples.filter(s => sa.predicted_ancestry != sa.true_ancestry)\n", "                               .map(s => {label: sa.predicted_ancestry, truth: sa.true_ancestry})\n", "                               .counter()''')\n", "pprint(c)\n", "print('YOUR SCORE: %d' % sum(c.values()))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Which ancestry group(s) are the hardest to unmask? Why?" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }