from hail import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from math import log, isnan

%matplotlib inline

def qqplot(pvals):
    spvals = sorted([x for x in pvals if x and not(isnan(x))])
    exp = [-log(float(i) / len(spvals), 10) for i in np.arange(1, len(spvals) + 1, 1)]
    obs = [-log(p, 10) for p in spvals]
    plt.scatter(exp, obs)
    plt.plot(np.arange(0, max(max(exp), max(obs))), c="red")
    plt.xlabel("Expected p-value (-log10 scale)")
    plt.ylabel("Observed p-value (-log10 scale)")
    plt.xlim(xmin=0)
    plt.ylim(ymin=0)
    
hc = HailContext()

(hc.balding_nichols_model(3, 2500, 1000, pop_dist = [0.2, 0.3, 0.5], fst = [.05, .09, .11])
 .variant_qc()
 .write('synth.vds', overwrite=True))

synth_vds = hc.read('synth.vds').pca('sa.pca')

pca_table = synth_vds.samples_keytable().to_pandas()
plt.scatter(pca_table["sa.pca.PC1"], pca_table["sa.pca.PC2"], alpha = .5)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("PC2 vs PC1")
plt.show()

pca_table = synth_vds.samples_keytable().to_pandas()
plt.scatter(pca_table["sa.pca.PC3"], pca_table["sa.pca.PC4"], alpha = .5)
plt.xlabel("PC3")
plt.ylabel("PC4")
plt.title("PC4 vs PC3")
plt.show()

synth_vds.globals._attrs

[ancestral_af, observed_af] = (synth_vds.annotate_variants_expr('va.obs_af = gs.callStats(g => v).AF[1]')
                               .query_variants(['variants.map(v => va.ancestralAF).collect()',
                                                'variants.map(v => va.obs_af).collect()']))
                               
plt.hist(ancestral_af, 25)
plt.show()

plt.hist(observed_af, 25)
plt.show()

Run the below code to randomly generate phenotype based on population, do a GWAS, and make a qq plot¶

pvals = (synth_vds.annotate_samples_expr(['sa.cov1 = rnorm(0, 1)', 'sa.cov2 = rnorm(0, 1)'])
 .annotate_samples_expr('sa.pheno = rnorm(1, 1) + 2 * sa.cov1 - sa.cov2 + .2 * sa.pop')
 .linreg('sa.pheno', covariates = ['sa.cov1', 'sa.cov2'])
           .query_variants('variants.map(v => va.linreg.pval).collect()'))

qqplot(pvals)

Using the two meaningful PCs as covariates removes the inflation¶

pvals = (synth_vds.annotate_samples_expr(['sa.cov1 = rnorm(0, 1)', 'sa.cov2 = rnorm(0, 1)'])
 .annotate_samples_expr('sa.pheno = rnorm(1, 1) + 2 * sa.cov1 - sa.cov2 + .2 * sa.pop')
 .linreg('sa.pheno', covariates = ['sa.cov1', 'sa.cov2', 'sa.pca.PC1', 'sa.pca.PC2'])
           .query_variants('variants.map(v => va.linreg.pval).collect()'))

qqplot(pvals)

Another model: generate true betas and true polygenic risk scores¶

(synth_vds.annotate_variants_expr('va.true_beta = rnorm(0, 1)')
 .write('synth_betas.vds', overwrite=True))

vds_with_risk_score = (hc.read('synth_betas.vds')
 .annotate_samples_expr('sa.true_prs = gs.map(g => g.nNonRefAlleles * va.true_beta).sum()'))

prs = vds_with_risk_score.query_samples('samples.map(s => sa.true_prs).collect()')

plt.hist(prs)
plt.show()

Assign case/control status from top 25% of polygenic risk score¶

cutoff = np.percentile(prs, 75)
print('cutoff is %f' % cutoff)

vds_with_risk_score = vds_with_risk_score.annotate_samples_expr('sa.isCase = sa.true_prs > %f' % cutoff)

Don't use PCs as covariates¶

gwas_vds = (vds_with_risk_score
            .annotate_samples_expr(['sa.cov1 = rnorm(0, 1)', 'sa.cov2 = rnorm(0, 1)'])
            .logreg('wald', 'sa.isCase', ['sa.cov1', 'sa.cov2']))

pvals = gwas_vds.query_variants('variants.map(v => va.logreg.pval).collect()')
qqplot(pvals)
plt.show()

estimated_prs = (gwas_vds.annotate_samples_expr('sa.estimated_prs = gs.map(g => g.nNonRefAlleles * va.logreg.beta).sum()')
                 .query_samples('samples.map(s => sa.estimated_prs).collect()'))

plt.scatter(prs, estimated_prs)
plt.xlabel("True Polygenic Risk Score")
plt.ylabel("Predicted Polygenic Risk Score")

plt.show()

Use PCs as covariates¶

gwas_vds = (vds_with_risk_score
            .annotate_samples_expr(['sa.cov1 = rnorm(0, 1)', 'sa.cov2 = rnorm(0, 1)'])
            .logreg('wald', 'sa.isCase', ['sa.cov1', 'sa.cov2', 'sa.pca.PC1', 'sa.pca.PC2']))

pvals = gwas_vds.query_variants('variants.map(v => va.logreg.pval).collect()')
qqplot(pvals)
plt.show()

estimated_prs = (gwas_vds.annotate_samples_expr('sa.estimated_prs = gs.map(g => g.nNonRefAlleles * va.logreg.beta).sum()')
                 .query_samples('samples.map(s => sa.estimated_prs).collect()'))

plt.scatter(prs, estimated_prs)
plt.xlabel("True Polygenic Risk Score")
plt.ylabel("Predicted Polygenic Risk Score")
plt.show()

[true_betas, beta_hats] = gwas_vds.query_variants(['variants.map(v => va.true_beta).collect()',
                                                  'variants.map(v => va.logreg.beta).collect()'])

plt.scatter(true_betas, beta_hats)
plt.xlabel('True Beta')
plt.ylabel('Estimated Beta')
plt.show()