## LDSC Practical 1 - SNP Heritability ## MadhurBain Singh, Benjamin Neale ## March 5th, 2025 ## Qualtrics Link # https://qimr.az1.qualtrics.com/jfe/form/SV_29tZDm8QmlN31Q2 ## Libraries library(data.table) ## to read in large files faster with `fread()` library(GenomicSEM) ## for ldsc ## Set the appropriate directory for this session # setwd("/home/practicals/3.1.SNPHeritability_MichelNivard/final/") setwd("~/practicals/3.1.SNPHeritability_MichelNivard/final/") # Set directories =========================================== sumstat_dir <- "EUR/" ref_data_dir <- "EUR/eur_w_ld_chr/" # BMI sum stats from the GIANT Consortium # https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files # HapMap3 SNPs and LD scores in European ancestry ================== #### HapMap3 SNPs ====================================== hm3_file <- paste0(ref_data_dir,"w_hm3.snplist") ## Take a look at it ## Tip: system() sends the command to the Terminal instead of R Console system(paste0("head ",hm3_file)) ## has information on SNP ID, allele 1, allele 2. system(paste0("wc -l ",hm3_file)) ## This command tells how many lines (= rows) there are in your file. ## Note that the number of lines also includes the line with column names. #### Question #### How many SNPs are there in the HapMap3 reference file? #### QC filters for LDSC =============================== info_cutoff <- 0.9 maf_cutoff <- 0.01 ## If this information is available in the sum stats, ## we may additionally apply these filters. # BMI =================================================================== ## For the tutorial, we will practice using munge() with only Chr 22. ## Read in the file (This is a subset of genome-wide sum stats file) bmi_file <- paste0(sumstat_dir, "CHR22only_Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.txt.gz") system(paste0("zcat ",bmi_file," | head")) ## Ignore the warning: "gzip: stdout: Broken pipe" ## Note: Verify the "effect allele" with respect to which the association statistic is signed. #### Rename columns as needed ========================= ## Renaming is not essential, as LDSC will usually interpret standard columns appropriately. ## However, the sum stats may not always have standard column names. ## In such a case, you may need to rename the columns. bmi_GWAS <- fread(bmi_file, header = T, data.table = F) str(bmi_GWAS) bmi_GWAS <- dplyr::rename( ## The data object bmi_GWAS, ## Variables to rename: New name = Old name A1 = Tested_Allele, A2 = Other_Allele ) ## check head(bmi_GWAS) ## Save the updated file bmi_updated_file <- paste0(sumstat_dir, "BMI_GWAS_for_LDSC.txt") fwrite(bmi_GWAS, file = bmi_updated_file, sep = "\t", col.names = T, row.names = T, quote = F) ## verify system(paste0("head ",bmi_updated_file)) ## Note: If the sample size is not available by SNP (in the GWAS sum stats file), ## find the GWAS samples size in the manuscript, and specify in the munge() function below. ### Munge ========================== ## Provide the file path and name to save the munged sum stats bmi_munged <- paste0(sumstat_dir,"munged_BMI_GWAS_chr22_for_LDSC") ## File Name munge( files = bmi_updated_file, ## File path of the raw sum stats hm3 = hm3_file, ## File path of the HapMap3 SNP list trait.names = bmi_munged, ## File path of the munged data (i.e., the output of this function) # N = , ## We do **not** specify the sample size here if the sum stats have SNP-specific N info.filter = info_cutoff, ## We may skip this, as there is no INFO column in our sum stats maf.filter = maf_cutoff ) ## Examine the log file (also printed in the R console) ### LDSC ======================================== ## We have already munged the full sum stats file (for all chromosomes) ## Raw sum stats raw_bmi <- paste0(sumstat_dir,"Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.txt.gz") ## Munged sum stats munged_bmi <- paste0(sumstat_dir,"Munged_BMI_Meta-analysis_Locke_et_al+UKBiobank_2018_for_LDSC.sumstats.gz") ## Examine how it differs from the raw data ## What columns were there in the raw sum stats? system(paste0("zcat ",raw_bmi," | head")) # Note: Ignore the warning "gzip: stdout: Broken pipe" ## What columns are there in the munged sum stats? system(paste0("zcat ",munged_bmi," | head")) ## How many SNPs were there in the raw sum stats system(paste0("zcat ",raw_bmi," | wc -l")) ## How many SNPs are there in the munged sum stats system(paste0("zcat ",munged_bmi," | wc -l")) ## Where do you want the output saved? bmi_ldsc_out <- paste0(sumstat_dir,"BMI") ldsc( traits = munged_bmi, ## File path of the munged sum stats sample.prev = NA, ## For continuous traits, sample prevalence is not applicable (set as NA) population.prev = NA, ## For continuous traits, population prevalence is not applicable (set as NA) ld = ref_data_dir, ## The directory with LD scores wld = ref_data_dir, ## The directory with weights for LD scores - the same directory trait.names = "BMI", ## Give an appropriate trait name for the output ldsc.log = bmi_ldsc_out ## File path to save the output log ) ## Examine the output (also saved as a log file you specified) ## Note: Safe to ignore the Warning message: ## Our version of ldsc requires 2 or more traits. Please include an additional trait. ## This funciton in GenomicSEM is written for bivariate LDSC ## But we are using it for univariate LDSC in this practical. # BONUS: BMI GWAS with LMM ======================================================== ## BMI sum stats from Pan-UKBB - using LMM (linear mixed models) with SAIGE ## Already munged for the practical munged_lmm_bmi <- paste0(sumstat_dir,"BMI.sumstats.gz") system(paste0("zcat ",munged_lmm_bmi," | head")) ## Here, N is the raw sample size (not included in the original sum stats file) ## Where do you want the output saved? bmi_lmm_ldsc_out <- paste0(sumstat_dir,"BMI_LMM") ## Run LDSC ldsc( traits = munged_lmm_bmi, ## File path of the munged sum stats sample.prev = NA, ## For continuous traits, sample prevalence is not applicable (set as NA) population.prev = NA, ## For continuous traits, population prevalence is not applicable (set as NA) ld = ref_data_dir, ## The directory with LD scores wld = ref_data_dir, ## The directory with weights for LD scores - the same directory trait.names = "BMI_lmm", ## Give an appropriate trait name for the output ldsc.log = bmi_lmm_ldsc_out ## File path to save the output log ) # END . ======================================================================