from hail import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from math import log, isnan
%matplotlib inline
def qqplot(pvals):
spvals = sorted([x for x in pvals if x and not(isnan(x))])
exp = [-log(float(i) / len(spvals), 10) for i in np.arange(1, len(spvals) + 1, 1)]
obs = [-log(p, 10) for p in spvals]
plt.scatter(exp, obs)
plt.plot(np.arange(0, max(max(exp), max(obs))), c="red")
plt.xlabel("Expected p-value (-log10 scale)")
plt.ylabel("Observed p-value (-log10 scale)")
plt.xlim(xmin=0)
plt.ylim(ymin=0)
hc = HailContext()
(hc.balding_nichols_model(3, 2500, 1000, pop_dist = [0.2, 0.3, 0.5], fst = [.05, .09, .11])
.variant_qc()
.write('synth.vds', overwrite=True))
synth_vds = hc.read('synth.vds').pca('sa.pca')
pca_table = synth_vds.samples_keytable().to_pandas()
plt.scatter(pca_table["sa.pca.PC1"], pca_table["sa.pca.PC2"], alpha = .5)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("PC2 vs PC1")
plt.show()
pca_table = synth_vds.samples_keytable().to_pandas()
plt.scatter(pca_table["sa.pca.PC3"], pca_table["sa.pca.PC4"], alpha = .5)
plt.xlabel("PC3")
plt.ylabel("PC4")
plt.title("PC4 vs PC3")
plt.show()
synth_vds.globals._attrs
[ancestral_af, observed_af] = (synth_vds.annotate_variants_expr('va.obs_af = gs.callStats(g => v).AF[1]')
.query_variants(['variants.map(v => va.ancestralAF).collect()',
'variants.map(v => va.obs_af).collect()']))
plt.hist(ancestral_af, 25)
plt.show()
plt.hist(observed_af, 25)
plt.show()
pvals = (synth_vds.annotate_samples_expr(['sa.cov1 = rnorm(0, 1)', 'sa.cov2 = rnorm(0, 1)'])
.annotate_samples_expr('sa.pheno = rnorm(1, 1) + 2 * sa.cov1 - sa.cov2 + .2 * sa.pop')
.linreg('sa.pheno', covariates = ['sa.cov1', 'sa.cov2'])
.query_variants('variants.map(v => va.linreg.pval).collect()'))
qqplot(pvals)
pvals = (synth_vds.annotate_samples_expr(['sa.cov1 = rnorm(0, 1)', 'sa.cov2 = rnorm(0, 1)'])
.annotate_samples_expr('sa.pheno = rnorm(1, 1) + 2 * sa.cov1 - sa.cov2 + .2 * sa.pop')
.linreg('sa.pheno', covariates = ['sa.cov1', 'sa.cov2', 'sa.pca.PC1', 'sa.pca.PC2'])
.query_variants('variants.map(v => va.linreg.pval).collect()'))
qqplot(pvals)
(synth_vds.annotate_variants_expr('va.true_beta = rnorm(0, 1)')
.write('synth_betas.vds', overwrite=True))
vds_with_risk_score = (hc.read('synth_betas.vds')
.annotate_samples_expr('sa.true_prs = gs.map(g => g.nNonRefAlleles * va.true_beta).sum()'))
prs = vds_with_risk_score.query_samples('samples.map(s => sa.true_prs).collect()')
plt.hist(prs)
plt.show()
cutoff = np.percentile(prs, 75)
print('cutoff is %f' % cutoff)
vds_with_risk_score = vds_with_risk_score.annotate_samples_expr('sa.isCase = sa.true_prs > %f' % cutoff)
gwas_vds = (vds_with_risk_score
.annotate_samples_expr(['sa.cov1 = rnorm(0, 1)', 'sa.cov2 = rnorm(0, 1)'])
.logreg('wald', 'sa.isCase', ['sa.cov1', 'sa.cov2']))
pvals = gwas_vds.query_variants('variants.map(v => va.logreg.pval).collect()')
qqplot(pvals)
plt.show()
estimated_prs = (gwas_vds.annotate_samples_expr('sa.estimated_prs = gs.map(g => g.nNonRefAlleles * va.logreg.beta).sum()')
.query_samples('samples.map(s => sa.estimated_prs).collect()'))
plt.scatter(prs, estimated_prs)
plt.xlabel("True Polygenic Risk Score")
plt.ylabel("Predicted Polygenic Risk Score")
plt.show()
gwas_vds = (vds_with_risk_score
.annotate_samples_expr(['sa.cov1 = rnorm(0, 1)', 'sa.cov2 = rnorm(0, 1)'])
.logreg('wald', 'sa.isCase', ['sa.cov1', 'sa.cov2', 'sa.pca.PC1', 'sa.pca.PC2']))
pvals = gwas_vds.query_variants('variants.map(v => va.logreg.pval).collect()')
qqplot(pvals)
plt.show()
estimated_prs = (gwas_vds.annotate_samples_expr('sa.estimated_prs = gs.map(g => g.nNonRefAlleles * va.logreg.beta).sum()')
.query_samples('samples.map(s => sa.estimated_prs).collect()'))
plt.scatter(prs, estimated_prs)
plt.xlabel("True Polygenic Risk Score")
plt.ylabel("Predicted Polygenic Risk Score")
plt.show()
[true_betas, beta_hats] = gwas_vds.query_variants(['variants.map(v => va.true_beta).collect()',
'variants.map(v => va.logreg.beta).collect()'])
plt.scatter(true_betas, beta_hats)
plt.xlabel('True Beta')
plt.ylabel('Estimated Beta')
plt.show()