library(data.table)
library(dplyr)

# Question 1.3
# Merge and restrict to the 100 genes listed in gene_names_to_extract.tsv.gz.
dt_to_extract <- fread("gene_names_to_extract.tsv.gz", header=FALSE)
dt_map <- fread("gene_mapping_to_extract.tsv.gz")
dt <- fread(file="dataset_1.tsv.gz", sep='\t')

dt_to_extract <- dt_to_extract %>% mutate(gene_name = V1)
setkey(dt_to_extract, "gene_name")
dt_map <- dt_map %>% rename(gene_id = `Gene stable ID`, gene_name = `Gene name`)
setkey(dt_map, "gene_name")
dt_genes <- merge(dt_map, dt_to_extract) %>% dplyr::select(gene_id, gene_name)
setkey(dt_genes, "gene_id")
setkey(dt, "gene_id")

# Filter down to the set of genes present in the list passed
dt <- merge(dt_genes, dt)

# What are the first five genes (by gene symbol) alphabetically?
setorder(dt, "gene_name")
dt$gene_name[1:5]

# Question 1.4
# You may assume that the number of cases and controls is fixed for each gene:
# N cases = 13,933, N controls = 14,422.
# Given that information, can you create an R function that uses the two
# provided counts, runs a two-sided Fisher's exact test, and provides an odds
# ratio and p-value?
# Note that you don't need to evaluate this yourself, you can make use of R
# functions. Hint: ?fisher.test
run_fisher_test <- function(case_var, control_var, n_cases=13933, n_controls=14422) {
    # Create a 2x2 contingency table from the input counts
    mat <- matrix(
        c(case_var, n_cases - case_var, control_var, n_controls - control_var),
        nrow=2)
    # Perform a two-sided Fisher's exact test
    f <- fisher.test(mat, alternative='two.sided')
    # Return the full test result (contains the odds ratio and p-value)
    return(f)
}

# Let's just test whether there is an association between presence of a pLoF
# in the gene and case status for each of these 100 genes.
# Can you generate a table containing the gene name, P-value, and OR for each
# gene? You can use apply, dplyr, or a loop to do this, whichever you prefer.
dt[, c("p_value", "odds_ratio") := {
    # Apply the Fisher's exact test per row (one gene per row)
    test <- run_fisher_test(
        ptv_fisher_gnom_non_psych_case_count,
        ptv_fisher_gnom_non_psych_control_count
    )
    list(test$p.value, test$estimate)
}, by = seq_len(nrow(dt))]

dt <- dt %>% dplyr::select(gene_id, gene_name, p_value, odds_ratio)

# Question 1.5
# Sort the table by P-value from lowest to highest.
setorder(dt, p_value)

# What are the top 5 rows of your table?
head(dt, 5)

# Question 1.8
# Can you create a QQ plot of the p-values for these results?
# Make sure to plot the p-values on a -log10 scale.
dt <- fread("dataset_2.tsv.gz")
plot(rev(-log10(seq(1, nrow(dt)) / (nrow(dt) + 1))),
     sort(-log10(dt$plof_p_value)),
     xlab='Expected -log10(P)', ylab='Observed -log10(P)')
abline(0, 1, col='red')

# Question 1.9
# What happens if you filter to the set of genes for which at least 10
# individuals have a variant in the gene?
# Consider this separately for 'ptv' and 'damaging missense' variation.
# Create a QQ plot again.
# Does it look more calibrated?
dt <- dt %>%
    filter((plof_fisher_gnom_non_psych_control_count +
            plof_fisher_gnom_non_psych_case_count) >= 10)
plot(rev(-log10(seq(1, nrow(dt)) / (nrow(dt) + 1))),
     sort(-log10(dt$plof_p_value)),
     xlab='Expected -log10(P)', ylab='Observed -log10(P)')
abline(0, 1, col='red')

# Let's assume that a bunch more individuals volunteered to be part of this study
# and contributed their data. If the ratios of the 2x2 contingency table remained
# the same (up to rounding), how many more samples would be required in order to
# observe an exome-wide significant signal of association for the top 2
# associations that you found?
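# A quick sanity check before searching (a sketch, not part of the original
# questions; the counts of 10 cases and 2 controls below are made up purely for
# illustration): scaling every cell of the 2x2 table by a common factor leaves
# the odds ratio essentially unchanged but shrinks the Fisher p-value, which is
# why a search over the scaling factor can recover the sample size needed for
# exome-wide significance.
for (k in c(1, 2, 4)) {
    f <- run_fisher_test(round(k * 10), round(k * 2),
                         round(k * 13933), round(k * 14422))
    cat("scaling factor", k, ": OR =", unname(f$estimate),
        ", P =", f$p.value, "\n")
}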
# First find the top two associations - what are they?
dt <- fread("dataset_2.tsv.gz")
# Inspect the table sorted by each p-value column in turn
setorder(dt, plof_p_value)
setorder(dt, damaging_missense_p_value)
# ENSG00000023516 (AKAP11), P = 1.150102e-05, OR = Inf
# ENSG00000083097 (DOP1A), P = 2.221591e-04, OR = 15.5442757

binary_search <- function(
    case_var, control_var, n_cases=13933, n_controls=14422,
    p_sig = 0.05 / 19225) {
    # Binary search over the factor by which the 2x2 table is scaled up,
    # keeping its ratios fixed (up to rounding).
    # Lower bound: the observed table (scaling factor of 1)
    current_guess_lower <- 1
    result_lower <- run_fisher_test(
        round(current_guess_lower * case_var),
        round(current_guess_lower * control_var),
        round(current_guess_lower * n_cases),
        round(current_guess_lower * n_controls)
    )
    # Upper bound: keep doubling until the scaled table reaches significance
    current_guess_upper <- 2
    result_upper <- run_fisher_test(
        round(current_guess_upper * case_var),
        round(current_guess_upper * control_var),
        round(current_guess_upper * n_cases),
        round(current_guess_upper * n_controls)
    )
    while (result_upper$p.value > p_sig) {
        current_guess_upper <- 2 * current_guess_upper
        result_upper <- run_fisher_test(
            round(current_guess_upper * case_var),
            round(current_guess_upper * control_var),
            round(current_guess_upper * n_cases),
            round(current_guess_upper * n_controls)
        )
    }
    # Bisect between the two bounds until they effectively meet
    while (result_upper$p.value / result_lower$p.value < 0.999) {
        current_guess <- (current_guess_upper + current_guess_lower) / 2
        cat("current guess", current_guess, "\n")
        result_mid <- run_fisher_test(
            round(current_guess * case_var),
            round(current_guess * control_var),
            round(current_guess * n_cases),
            round(current_guess * n_controls)
        )
        if (result_mid$p.value < p_sig) {
            current_guess_upper <- current_guess
            result_upper <- result_mid
        } else {
            current_guess_lower <- current_guess
            result_lower <- result_mid
        }
        if ((current_guess_upper - current_guess_lower) < 1e-10) {
            break
        }
    }
    return(list(
        scaling = current_guess_upper,
        case_var = round(current_guess_upper * case_var),
        control_var = round(current_guess_upper * control_var),
        n_cases = round(current_guess_upper * n_cases),
        n_controls = round(current_guess_upper * n_controls),
        result = result_upper))
}

# The number of additional samples required is the scaled total minus the
# original total, i.e. (scaling - 1) * (13933 + 14422).
# AKAP11
binary_search(16, 0)
# DOP1A
binary_search(15, 1)

# Stretch goal
# Question 1.10
# Another researcher took a look at the data and thought that it would be a great
# idea to also analyse damaging missense variation.
# This seems like a good idea, but would require accounting for double the number
# of tests.
# They suggest performing a Cauchy combination test, and controlling for the same
# number of tests.
# This seems like black magic. Can you really combine as many tests as you like,
# not caring about the effective number of tests? Cheers, Cauchy.
# Using the code in SAIGE, determine Cauchy combination p-values for each gene.
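# An aside on why the Cauchy combination is not black magic (a sketch, not part
# of the original questions): each p-value p is mapped to a standard Cauchy
# variable via tan((0.5 - p) * pi). A weighted average of standard Cauchy
# variables (weights summing to 1) is itself standard Cauchy when the p-values
# are independent, and Liu & Xie (2020) show the tail approximation still holds
# under arbitrary dependence, so the averaged statistic can be converted back
# into a single calibrated p-value without counting the number of tests.
set.seed(42)
p1 <- runif(10000)
p2 <- runif(10000)
combined_p <- 1 - pcauchy(0.5 * tan((0.5 - p1) * pi) + 0.5 * tan((0.5 - p2) * pi))
# Under the null, the combined p-values remain approximately uniform on [0, 1]
ks.test(combined_p, "punif")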
# Cauchy combination test (CCT), following the implementation in SAIGE.
CCT <- function(pvals, weights=NULL) {
    # Check the p-values are valid
    if (sum(is.na(pvals)) > 0) {
        stop("Cannot have NAs in the p-values!")
    }
    if ((sum(pvals < 0) + sum(pvals > 1)) > 0) {
        stop("All p-values must be between 0 and 1!")
    }

    # Handle p-values that are exactly 0 or exactly 1
    is.zero <- (sum(pvals == 0) >= 1)
    is.one <- (sum(pvals == 1) >= 1)
    if (is.zero) {
        return(0)
    }
    if (is.one) {
        warning("There are p-values that are exactly 1!")
        return(min(1, min(pvals) * length(pvals)))
    }

    # Default to equal weights; otherwise validate and normalise the weights
    if (is.null(weights)) {
        weights <- rep(1 / length(pvals), length(pvals))
    } else if (length(weights) != length(pvals)) {
        stop("The length of weights should be the same as that of the p-values!")
    } else if (sum(weights < 0) > 0) {
        stop("All the weights must be positive!")
    } else {
        weights <- weights / sum(weights)
    }

    # Compute the Cauchy combination statistic, using a tail approximation for
    # very small p-values to avoid numerical overflow in tan()
    is.small <- pvals < 1e-16
    if (sum(is.small) == 0) {
        cct.stat <- sum(weights * tan((0.5 - pvals) * pi))
    } else {
        cct.stat <- sum((weights[is.small] / pvals[is.small]) / pi)
        cct.stat <- cct.stat +
            sum(weights[!is.small] * tan((0.5 - pvals[!is.small]) * pi))
    }

    # Convert the statistic back to a p-value
    if (cct.stat > 1e+15) {
        pval <- (1 / cct.stat) / pi
    } else {
        pval <- 1 - pcauchy(cct.stat)
    }
    return(pval)
}

dt <- fread("dataset_2.tsv.gz")
dt[, cauchy_p := CCT(c(plof_p_value, damaging_missense_p_value)),
   by = seq_len(nrow(dt))]

# Question 1.11
p <- runif(100)
dt <- data.table(p1 = p, p2 = p)
dt[, cauchy_p := CCT(c(p1, p2)), by = seq_len(nrow(dt))]
# The Cauchy combination column is identical to the original p-values! (You can
# also see this by looking at the CCT formula.)
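# A quick check of why this happens (a sketch; the algebra is just the CCT
# formula applied to two equal p-values): with equal weights and p1 == p2 == p,
# the statistic collapses to tan((0.5 - p) * pi), and
# 1 - pcauchy(tan((0.5 - p) * pi)) == p for p in (0, 1), so the combination
# simply returns the original p-value.
all.equal(CCT(c(0.3, 0.3)), 0.3)
p_check <- runif(5)
sapply(p_check, function(p) CCT(c(p, p))) - p_check  # differences are ~0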