library(data.table)
library(dplyr)

# Question 1.3
# Merge and restrict to the 100 genes listed in gene_names_to_extract.tsv.gz.
dt_to_extract <- fread("gene_names_to_extract.tsv.gz", header=FALSE)
dt_map <- fread("gene_mapping_to_extract.tsv.gz")
dt <- fread(file="dataset_1.tsv.gz", sep='\t')

dt_to_extract <- dt_to_extract %>% mutate(gene_name = V1)
setkey(dt_to_extract, "gene_name")
dt_map <- dt_map %>% rename(gene_id = `Gene stable ID`, gene_name = `Gene name`)
setkey(dt_map, "gene_name")
dt_genes <- merge(dt_map, dt_to_extract) %>% dplyr::select(gene_id, gene_name)
setkey(dt_genes, "gene_id")
setkey(dt, "gene_id")

# Filter down to the set of genes present in the list passed
dt <- merge(dt_genes, dt)

# What are the first five genes (by gene symbol) alphabetically?
setorder(dt, "gene_name")
dt$gene_name[1:5]

# Question 1.4
# You may assume that the number of cases and controls is fixed for each gene:
# N cases = 13,933, N controls = 14,422.
# Given that information, can you create an R function that uses the two
# provided counts, runs a two-sided Fisher's exact test, and provides an odds
# ratio and p-value?
# Note that you don't need to evaluate this yourself, you can make use of R
# functions. Hint: ?fisher.test
run_fisher_test <- function(case_var, control_var, n_cases=13933, n_controls=14422) {
    # Create a 2x2 contingency table from the input counts
    mat <- matrix(
        c(case_var, n_cases - case_var, control_var, n_controls - control_var),
        nrow=2)
    # Perform a two-sided Fisher's exact test
    f <- fisher.test(mat, alternative='two.sided')
    # Return the full test result (contains the odds ratio and p-value)
    return(f)
}

# Let's just test whether there is an association between presence of a pLoF
# in the gene and case status for each of these 100 genes.
# Can you generate a table containing the gene name, P-value, and OR for each
# gene? You can use apply, dplyr, or a loop to do this, whichever you prefer.
dt[, c("p_value", "odds_ratio") := {
    # Apply the Fisher's exact test per row (one gene per row)
    test <- run_fisher_test(
        ptv_fisher_gnom_non_psych_case_count,
        ptv_fisher_gnom_non_psych_control_count
    )
    list(test$p.value, test$estimate)
}, by = seq_len(nrow(dt))]

dt <- dt %>% dplyr::select(gene_id, gene_name, p_value, odds_ratio)

# Question 1.5
# Sort the table by P-value from lowest to highest.
setorder(dt, p_value)

# What are the top 5 rows of your table?
head(dt, 5)

# Question 1.8
# Can you create a QQ plot of the p-values for these results?
# Make sure to plot the p-values on a -log10 scale.
dt <- fread("dataset_2.tsv.gz")
plot(rev(-log10(seq(1, nrow(dt)) / (nrow(dt) + 1))),
     sort(-log10(dt$plof_p_value)),
     xlab='Expected -log10(P)', ylab='Observed -log10(P)')
abline(0, 1, col='red')

# Question 1.9
# What happens if you filter to the set of genes for which at least 10
# individuals have a variant in the gene?
# Consider this separately for 'ptv' and 'damaging missense' variation.
# Create a QQ plot again.
# Does it look more calibrated?
dt <- dt %>%
    filter((plof_fisher_gnom_non_psych_control_count +
            plof_fisher_gnom_non_psych_case_count) >= 10)
plot(rev(-log10(seq(1, nrow(dt)) / (nrow(dt) + 1))),
     sort(-log10(dt$plof_p_value)),
     xlab='Expected -log10(P)', ylab='Observed -log10(P)')
abline(0, 1, col='red')

# Let's assume that a bunch more individuals volunteered to be part of this study
# and contributed their data. If the ratios of the 2x2 contingency table remained
# the same (up to rounding), how many more samples would be required in order to
# observe an exome-wide significant signal of association for the top 2
# associations that you found?
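# A quick sanity check before searching (a sketch, not part of the original
# questions; the counts of 10 cases and 2 controls below are made up purely for
# illustration): scaling every cell of the 2x2 table by a common factor leaves
# the odds ratio essentially unchanged but shrinks the Fisher p-value, which is
# why a search over the scaling factor can recover the sample size needed for
# exome-wide significance.
for (k in c(1, 2, 4)) {
    f <- run_fisher_test(round(k * 10), round(k * 2),
                         round(k * 13933), round(k * 14422))
    cat("scaling factor", k, ": OR =", unname(f$estimate),
        ", P =", f$p.value, "\n")
}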
# First find the top two associations - what are they?
dt <- fread("dataset_2.tsv.gz")
# Inspect the table sorted by each p-value column in turn
setorder(dt, plof_p_value)
setorder(dt, damaging_missense_p_value)
# ENSG00000023516 (AKAP11), P = 1.150102e-05, OR = Inf
# ENSG00000083097 (DOP1A), P = 2.221591e-04, OR = 15.5442757

binary_search <- function(
    case_var, control_var, n_cases=13933, n_controls=14422,
    p_sig = 0.05 / 19225) {
    # Binary search over the factor by which the 2x2 table is scaled up,
    # keeping its ratios fixed (up to rounding).
    # Lower bound: the observed table (scaling factor of 1)
    current_guess_lower <- 1
    result_lower <- run_fisher_test(
        round(current_guess_lower * case_var),
        round(current_guess_lower * control_var),
        round(current_guess_lower * n_cases),
        round(current_guess_lower * n_controls)
    )
    # Upper bound: keep doubling until the scaled table reaches significance
    current_guess_upper <- 2
    result_upper <- run_fisher_test(
        round(current_guess_upper * case_var),
        round(current_guess_upper * control_var),
        round(current_guess_upper * n_cases),
        round(current_guess_upper * n_controls)
    )
    while (result_upper$p.value > p_sig) {
        current_guess_upper <- 2 * current_guess_upper
        result_upper <- run_fisher_test(
            round(current_guess_upper * case_var),
            round(current_guess_upper * control_var),
            round(current_guess_upper * n_cases),
            round(current_guess_upper * n_controls)
        )
    }
    # Bisect between the two bounds until they effectively meet
    while (result_upper$p.value / result_lower$p.value < 0.999) {
        current_guess <- (current_guess_upper + current_guess_lower) / 2
        cat("current guess", current_guess, "\n")
        result_mid <- run_fisher_test(
            round(current_guess * case_var),
            round(current_guess * control_var),
            round(current_guess * n_cases),
            round(current_guess * n_controls)
        )
        if (result_mid$p.value < p_sig) {
            current_guess_upper <- current_guess
            result_upper <- result_mid
        } else {
            current_guess_lower <- current_guess
            result_lower <- result_mid
        }
        if ((current_guess_upper - current_guess_lower) < 1e-10) {
            break
        }
    }
    return(list(
        scaling = current_guess_upper,
        case_var = round(current_guess_upper * case_var),
        control_var = round(current_guess_upper * control_var),
        n_cases = round(current_guess_upper * n_cases),
        n_controls = round(current_guess_upper * n_controls),
        result = result_upper))
}

# The number of additional samples required is the scaled total minus the
# original total, i.e. (scaling - 1) * (13933 + 14422).
# AKAP11
binary_search(16, 0)
# DOP1A
binary_search(15, 1)

# Stretch goal
# Question 1.10
# Another researcher took a look at the data and thought that it would be a great
# idea to also analyse damaging missense variation.
# This seems like a good idea, but would require accounting for double the number
# of tests.
# They suggest performing a Cauchy combination test, and controlling for the same
# number of tests.
# This seems like black magic. Can you really combine as many tests as you like,
# not caring about the effective number of tests? Cheers, Cauchy.
# Using the code in SAIGE, determine Cauchy combination p-values for each gene.
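# An aside on why the Cauchy combination is not black magic (a sketch, not part
# of the original questions): each p-value p is mapped to a standard Cauchy
# variable via tan((0.5 - p) * pi). A weighted average of standard Cauchy
# variables (weights summing to 1) is itself standard Cauchy when the p-values
# are independent, and Liu & Xie (2020) show the tail approximation still holds
# under arbitrary dependence, so the averaged statistic can be converted back
# into a single calibrated p-value without counting the number of tests.
set.seed(42)
p1 <- runif(10000)
p2 <- runif(10000)
combined_p <- 1 - pcauchy(0.5 * tan((0.5 - p1) * pi) + 0.5 * tan((0.5 - p2) * pi))
# Under the null, the combined p-values remain approximately uniform on [0, 1]
ks.test(combined_p, "punif")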
# Cauchy combination test (CCT), following the implementation in SAIGE.
CCT <- function(pvals, weights=NULL) {
    # Check the p-values are valid
    if (sum(is.na(pvals)) > 0) {
        stop("Cannot have NAs in the p-values!")
    }
    if ((sum(pvals < 0) + sum(pvals > 1)) > 0) {
        stop("All p-values must be between 0 and 1!")
    }

    # Handle p-values that are exactly 0 or exactly 1
    is.zero <- (sum(pvals == 0) >= 1)
    is.one <- (sum(pvals == 1) >= 1)
    if (is.zero) {
        return(0)
    }
    if (is.one) {
        warning("There are p-values that are exactly 1!")
        return(min(1, min(pvals) * length(pvals)))
    }

    # Default to equal weights; otherwise validate and normalise the weights
    if (is.null(weights)) {
        weights <- rep(1 / length(pvals), length(pvals))
    } else if (length(weights) != length(pvals)) {
        stop("The length of weights should be the same as that of the p-values!")
    } else if (sum(weights < 0) > 0) {
        stop("All the weights must be positive!")
    } else {
        weights <- weights / sum(weights)
    }

    # Compute the Cauchy combination statistic, using a tail approximation for
    # very small p-values to avoid numerical overflow in tan()
    is.small <- pvals < 1e-16
    if (sum(is.small) == 0) {
        cct.stat <- sum(weights * tan((0.5 - pvals) * pi))
    } else {
        cct.stat <- sum((weights[is.small] / pvals[is.small]) / pi)
        cct.stat <- cct.stat +
            sum(weights[!is.small] * tan((0.5 - pvals[!is.small]) * pi))
    }

    # Convert the statistic back to a p-value
    if (cct.stat > 1e+15) {
        pval <- (1 / cct.stat) / pi
    } else {
        pval <- 1 - pcauchy(cct.stat)
    }
    return(pval)
}

dt <- fread("dataset_2.tsv.gz")
dt[, cauchy_p := CCT(c(plof_p_value, damaging_missense_p_value)),
   by = seq_len(nrow(dt))]

# Question 1.11
p <- runif(100)
dt <- data.table(p1 = p, p2 = p)
dt[, cauchy_p := CCT(c(p1, p2)), by = seq_len(nrow(dt))]
# The Cauchy combination column is identical to the original p-values! (You can
# also see this by looking at the CCT formula.)
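# A quick check of why this happens (a sketch; the algebra is just the CCT
# formula applied to two equal p-values): with equal weights and p1 == p2 == p,
# the statistic collapses to tan((0.5 - p) * pi), and
# 1 - pcauchy(tan((0.5 - p) * pi)) == p for p in (0, 1), so the combination
# simply returns the original p-value.
all.equal(CCT(c(0.3, 0.3)), 0.3)
p_check <- runif(5)
sapply(p_check, function(p) CCT(c(p, p))) - p_check  # differences are ~0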