## LDSC Practical 1 - SNP Heritability
## MadhurBain Singh, Benjamin Neale
## March 5th, 2025

## Qualtrics Link
# https://qimr.az1.qualtrics.com/jfe/form/SV_29tZDm8QmlN31Q2 


## Libraries
library(data.table)  ## to read in large files faster with `fread()`
library(GenomicSEM)  ## for ldsc

## Set the appropriate directory for this session
# setwd("/home/practicals/3.1.SNPHeritability_MichelNivard/final/")
setwd("~/practicals/3.1.SNPHeritability_MichelNivard/final/")


# Set directories ===========================================
sumstat_dir  <- "EUR/"
ref_data_dir <- "EUR/eur_w_ld_chr/"

# BMI sum stats from the GIANT Consortium
# https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files


# HapMap3 SNPs and LD scores in European ancestry ==================


#### HapMap3 SNPs ======================================

hm3_file <- paste0(ref_data_dir,"w_hm3.snplist")
## Take a look at it
## Tip: system() sends the command to the Terminal instead of R Console

system(paste0("head ",hm3_file))
## has information on SNP ID, allele 1, allele 2.

system(paste0("wc -l ",hm3_file))
## This command tells how many lines (= rows) there are in your file.
## Note that the number of lines also includes the line with column names.


#### Question
#### How many SNPs are there in the HapMap3 reference file?


#### QC filters for LDSC ===============================
info_cutoff <- 0.9
maf_cutoff  <- 0.01
## If this information is available in the sum stats, 
## we may additionally apply these filters.


# BMI ===================================================================

## For the tutorial, we will practice using munge() with only Chr 22.

## Read in the file (This is a subset of genome-wide sum stats file)
bmi_file <- paste0(sumstat_dir, "CHR22only_Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.txt.gz")

system(paste0("zcat ",bmi_file," | head"))
## Ignore the warning: "gzip: stdout: Broken pipe"
## Note: Verify the "effect allele" with respect to which the association statistic is signed.


#### Rename columns as needed =========================

## Renaming is not essential, as LDSC will usually interpret standard columns appropriately.
## However, the sum stats may not always have standard column names.
## In such a case, you may need to rename the columns.

bmi_GWAS <- fread(bmi_file, header = T, data.table = F)
str(bmi_GWAS)

bmi_GWAS <- dplyr::rename(
  ## The data object
  bmi_GWAS, 
  ## Variables to rename: New name = Old name
  A1 = Tested_Allele,
  A2 = Other_Allele
)

## check
head(bmi_GWAS)

## Save the updated file
bmi_updated_file <- paste0(sumstat_dir, "BMI_GWAS_for_LDSC.txt")
fwrite(bmi_GWAS, file = bmi_updated_file, sep = "\t", 
       col.names = T, row.names = T, quote = F)

## verify
system(paste0("head ",bmi_updated_file))


## Note: If the sample size is not available by SNP (in the GWAS sum stats file),
## find the GWAS samples size in the manuscript, and specify in the munge() function below.


### Munge ==========================

## Provide the file path and name to save the munged sum stats
bmi_munged <- paste0(sumstat_dir,"munged_BMI_GWAS_chr22_for_LDSC")      ## File Name

munge(
  files = bmi_updated_file,     ## File path of the raw sum stats
  hm3 = hm3_file,               ## File path of the HapMap3 SNP list
  trait.names = bmi_munged,     ## File path of the munged data (i.e., the output of this function)
  # N = ,                       ## We do **not** specify the sample size here if the sum stats have SNP-specific N
  info.filter = info_cutoff,    ## We may skip this, as there is no INFO column in our sum stats
  maf.filter = maf_cutoff
)

## Examine the log file (also printed in the R console)


### LDSC ========================================

## We have already munged the full sum stats file (for all chromosomes)

## Raw sum stats
raw_bmi    <- paste0(sumstat_dir,"Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.txt.gz")
## Munged sum stats
munged_bmi <- paste0(sumstat_dir,"Munged_BMI_Meta-analysis_Locke_et_al+UKBiobank_2018_for_LDSC.sumstats.gz")

## Examine how it differs from the raw data
## What columns were there in the raw sum stats?
system(paste0("zcat ",raw_bmi," | head"))
# Note: Ignore the warning "gzip: stdout: Broken pipe"
## What columns are there in the munged sum stats?
system(paste0("zcat ",munged_bmi," | head"))

## How many SNPs were there in the raw sum stats
system(paste0("zcat ",raw_bmi," | wc -l"))
## How many SNPs are there in the munged sum stats
system(paste0("zcat ",munged_bmi," | wc -l"))

## Where do you want the output saved?
bmi_ldsc_out <- paste0(sumstat_dir,"BMI")

ldsc(
  traits = munged_bmi,         ## File path of the munged sum stats
  sample.prev = NA,            ## For continuous traits, sample prevalence is not applicable (set as NA)
  population.prev = NA,        ## For continuous traits, population prevalence is not applicable (set as NA)
  ld = ref_data_dir,           ## The directory with LD scores
  wld = ref_data_dir,          ## The directory with weights for LD scores - the same directory
  trait.names = "BMI",         ## Give an appropriate trait name for the output
  ldsc.log = bmi_ldsc_out      ## File path to save the output log
)

## Examine the output (also saved as a log file you specified)

## Note: Safe to ignore the Warning message:
## Our version of ldsc requires 2 or more traits. Please include an additional trait.

## This funciton in GenomicSEM is written for bivariate LDSC
## But we are using it for univariate LDSC in this practical.


# BONUS: BMI GWAS with LMM ========================================================

## BMI sum stats from Pan-UKBB - using LMM (linear mixed models) with SAIGE

## Already munged for the practical

munged_lmm_bmi <- paste0(sumstat_dir,"BMI.sumstats.gz")  

system(paste0("zcat ",munged_lmm_bmi," | head"))

## Here, N is the raw sample size (not included in the original sum stats file)

## Where do you want the output saved?
bmi_lmm_ldsc_out <- paste0(sumstat_dir,"BMI_LMM")

## Run LDSC
ldsc(
  traits = munged_lmm_bmi,     ## File path of the munged sum stats
  sample.prev = NA,            ## For continuous traits, sample prevalence is not applicable (set as NA)
  population.prev = NA,        ## For continuous traits, population prevalence is not applicable (set as NA)
  ld = ref_data_dir,           ## The directory with LD scores
  wld = ref_data_dir,          ## The directory with weights for LD scores - the same directory
  trait.names = "BMI_lmm",     ## Give an appropriate trait name for the output
  ldsc.log = bmi_lmm_ldsc_out  ## File path to save the output log
)


# END . ======================================================================