## LDSC Practical 2 - Genetic Correlation ## MadhurBain Singh, Benjamin Neale ## March 5th, 2025 ## Adapted from Andrew Grotzinger, March 2023 # Qualtrics: https://qimr.az1.qualtrics.com/jfe/form/SV_781CgwvIn2YqyqO library(GenomicSEM) library(corrplot) ## Optional - for making heatmaps ## Directory for this session setwd("~/practicals/3.2.GeneticCorrelation_MadhurSingh/final/") ## In this session, we will analyze data from chr 1 only. # PART 1: Continuous Traits Example in European Ancestry ============================================ setwd("EUR") #### HapMap3 SNPs ====================================== ## As before, we will restrict the analyses to high-quality SNPs in the GWAS sum stats ## Which also overlap with the SNPs we have pre-computed reference LD scores. ## Here, LD scores have been computed using HapMap3 SNPs. hm3 <- "eur_w_ld_chr/w_hm3.snplist" ## Take a look at the SNP list ## Tip: system() sends the command to the Terminal instead of R Console system(paste0("head ", hm3)) ## has information on SNP ID, allele 1, allele 2. ## The munge() will also align the alleles across the GWAS sum stats and the reference data. system(paste0("wc -l ", hm3)) ## This command tells how many lines (= rows) there are in your file. ## Note that the number of lines also includes the line with column names. #### How many SNPs are there in the HapMap3 reference file? ## 1217311 HM3 SNPs in the reference data ##### Step 1: Munge the data #### ## Look at the sum stats and the columns in these files ## 1. BMI system("head GIANT_UKB_BMI_EUR_chr1.txt") ## 2. Height system("head Yengo_Height_EUR_chr1.txt") ## Note that both files include sample size ## So, we'd not specify N in the munge() step. ## For multivariate LDSC, we specify a vector of the file paths with GWAS sum stats. files <- c("GIANT_UKB_BMI_EUR_chr1.txt", "Yengo_Height_EUR_chr1.txt") ## Name the traits ## The order should match the order of file paths. trait.names <- c("BMI_chr1", "Height_chr1") #define the imputation quality filter info.filter = 0.9 #define the MAF filter maf.filter = 0.01 #run munge munge( files = files, hm3 = hm3, trait.names = trait.names, # N = N, ## Not specifying N here. info.filter = info.filter, maf.filter = maf.filter ) ## This function will perform munge separately for each trait. ## Examine that the columns were interpreted correctly for both files. ## SNP, A1 (effect allele), A2 (other allele), Beta/Effect size, P value, N, MAF ##### Step 2: run LDSC #### ## Examine the munged sum stats system("zcat BMI.sumstats.gz | head") system("zcat Height.sumstats.gz | head") # Note: Ignore the warning "gzip: stdout: Broken pipe" # traits = the name of the .sumstats.gz traits produced by munge traits <- c("BMI.sumstats.gz", "Height.sumstats.gz") ## ld = folder of LD scores ld <- "eur_w_ld_chr/" # wld = the same folder as LD scores wld <- "eur_w_ld_chr/" # sample.prev = the proportion of cases to total sample size. For quantitative traits list NA sample.prev <- c(NA, NA) # population.prev = the population lifetime prevalence of the traits. For quantitative traits list NA population.prev <- c(NA, NA) # trait.names = optional fifth argument to list trait names so they can be named in your model # follow the same order as the munged files in "traits" trait.names <- c("BMI", "Height") LDSC_EUR <- ldsc( traits = traits, sample.prev = sample.prev, population.prev = population.prev, ld = ld, wld = wld, trait.names = trait.names ) ## Save for future use save(LDSC_EUR, file = "LDSC_EUR.RData") ## The log/output has three main parts: ## [1/3] LDSC analyses of Trait 1 (h2_SNP) ## [2/3] Cross-trait LDSC analyses of Trait 1 and 2 (r_g) ## [3/3] LDSC analyses of Trait 2 (h2_SNP) # Note that the heritability estimates and univariate LDSC intercepts obtained in univariate analyses # will often be slightly different from heritability estimates and intercepts in bivariate analyses # This is because bivariate analysis uses the intersection of the SNPs in the two GWAS sum stats. # Univariate analysis uses all of the SNPs in the respective GWAS sum stats. # Usually these two sets of SNPs will be slightly different, causing small differences in the estimates. #genetic covariance matrix LDSC_EUR$S #genetic correlation matrix cov2cor(LDSC_EUR$S) #lets grab the standard errors from the V matrix k <- nrow(LDSC_EUR$S) SE <- matrix(0, k, k) SE[lower.tri(SE, diag = TRUE)] <- sqrt(diag(LDSC_EUR$V)) SE #matrix of Z-stats Z <- LDSC_EUR$S / SE Z #matrix of p-values P <- 2 * pnorm(Z, lower.tail = FALSE) P ##### Step 3 (Optional): create rg heatmap #### rownames(LDSC_EUR$S) <- colnames(LDSC_EUR$S) corrplot( corr = cov2cor(LDSC_EUR$S), method = "color", addCoef.col = "dark grey", add = F, bg = "white", diag = T, outline = T, mar = c(0, 0, 2, 0), number.cex = 2, cl.pos = "b", cl.ratio = 0.125, cl.align.text = "l", cl.offset = 0.2, tl.srt = 45, tl.pos = "lt", tl.offset = 0.2, tl.col = "black", pch.col = "white", addgrid.col = "black", xpd = T, tl.cex = 1.3, is.corr = TRUE, title = "European" ) rm(list = ls()) # PART 2: Continuous Traits Example in East Asian Ancestry ========================================== setwd("../EAS") ## Similar steps as above. ## The sum stats and LD scores are from East Asian Ancestry ##### Step 1: Munge the data #### system("head BMI_EAS_chr1.txt") system("head Yengo_Height_EAS_chr1.txt") files <- c("BMI_EAS_chr1.txt", "Yengo_Height_EAS_chr1.txt") #define the reference file being used to align alleles across summary stats #here we are using hapmap3 hm3 <- "eas_ldscores/w_hm3.snplist" system(paste0("head ", hm3)) # Looks the same as before system(paste0("wc -l ", hm3)) # Looks the same as before # Has 1217311 SNPs #name the traits trait.names <- c("BMI", "Height") # list the sample sizes. # Yengo_Height has SNP-specific sample size in the file # but BMI sum stats do not, so we will provide that here. # NA for Height N <- c(163835, NA) #definte the imputation quality filter info.filter = 0.9 #define the MAF filter maf.filter = 0.01 #run munge munge( files = files, hm3 = hm3, trait.names = trait.names, N = N, info.filter = info.filter, maf.filter = maf.filter ) ## Again, examine that the columns were interpreted correctly for both files. ## SNP, A1 (effect allele), A2 (other allele), Beta/Effect size, P value, N, MAF ##### Step 2: run LDSC #### ## Examine the munged sum stats system("zcat BMI.sumstats.gz | head") system("zcat Height.sumstats.gz | head") # Note: Ignore the warning "gzip: stdout: Broken pipe" #traits = the name of the .sumstats.gz traits produced by munge traits <- c("BMI.sumstats.gz", "Height.sumstats.gz") ##ld = folder of LD scores ld <- "eas_ldscores/" #wld = folder of LD scores wld <- "eas_ldscores/" #sample.prev = the proportion of cases to total sample size. For quantitative traits list NA sample.prev <- c(NA, NA) #population.prev = the population lifetime prevalence of the traits. For quantitative traits list NA population.prev <- c(NA, NA) #trait.names = optional fifth argument to list trait names so they can be named in your model trait.names <- c("BMI", "Height") LDSC_EAS <- ldsc( traits = traits, sample.prev = sample.prev, population.prev = population.prev, ld = ld, wld = wld, trait.names = trait.names ) # Save for later use save(LDSC_EAS, file = "LDSC_EAS.RData") #genetic covariance matrix LDSC_EAS$S ## Note here that the diagonal elements are the SNP-h2 ## and the off-diagonal elements are genetic covariance # Compare with EUR load("../EUR/LDSC_EUR.RData") LDSC_EUR$S #genetic correlation matrix cov2cor(LDSC_EAS$S) #standard errors k <- nrow(LDSC_EAS$S) SE <- matrix(0, k, k) SE[lower.tri(SE, diag = TRUE)] <- sqrt(diag(LDSC_EAS$V)) SE #matrix of Z-stats Z <- LDSC_EAS$S / SE Z #matrix of p-values P <- 2 * pnorm(Z, lower.tail = FALSE) P # Note: the off-diagonal element gives the p-value of genetic correlation. ##### Step 3 (Optional): create rg heatmap #### rownames(LDSC_EAS$S) <- colnames(LDSC_EAS$S) corrplot( corr = cov2cor(LDSC_EAS$S), method = "color", addCoef.col = "dark grey", add = F, bg = "white", diag = T, outline = T, mar = c(0, 0, 2, 0), number.cex = 2, cl.pos = "b", cl.ratio = 0.125, cl.align.text = "l", cl.offset = 0.2, tl.srt = 45, tl.pos = "lt", tl.offset = 0.2, tl.col = "black", pch.col = "white", addgrid.col = "black", xpd = T, tl.cex = 1.3, is.corr = TRUE, title = "East Asian" ) rm(list = ls()) # PART 3: Binary (case/control) Traits Example in European Ancestry =============================== setwd("../EUR") ##### Step 1: Munge the data #### system("head SCZ_chr1.txt") system("head BIP_chr1.txt") ## Note that SCZ sum stats include the effective N (Neff) ## BIP sum stats include NEFFDIV2 (which is half of Neff) ## The munge function _can_ identify these different Ns. ## But we must ensure that these are interpreted correctly in the munge step below. files <- c("SCZ_chr1.txt", "BIP_chr1.txt") #define the reference file being used to align alleles across summary stats #here we are using hapmap3 hm3 <- "eur_w_ld_chr/w_hm3.snplist" #name the traits trait.names <- c("SCZ", "BIP") #list the sample sizes. #we write NA here as these files already have Neff as columns N <- c(NA, NA) #define the imputation quality filter info.filter = 0.9 #define the MAF filter maf.filter = 0.01 #run munge munge( files = files, hm3 = hm3, trait.names = trait.names, N = N, info.filter = info.filter, maf.filter = maf.filter ) ## Examine that the columns were interpreted correctly for both traits. ## Ensure that the reported NEFFDIV2 in BIP has been doubled to get Neff. ##### Step 2: run LDSC #### ## Examine the munged sum stats system("zcat SCZ.sumstats.gz | head") system("zcat BIP.sumstats.gz | head") # Note: Ignore the warning "gzip: stdout: Broken pipe" #traits = the name of the .sumstats.gz traits produced by munge traits <- c("SCZ.sumstats.gz", "BIP.sumstats.gz") ##ld = folder of LD scores ld <- "eur_w_ld_chr/" #wld = folder of LD scores wld <- "eur_w_ld_chr/" #sample.prev: enter as 0.5 because used sum of effecitve N, #that accounts for sample ascertainment sample.prev <- c(.5, .5) #population.prev = the population lifetime prevalence of the traits. population.prev <- c(.01, .02) #trait.names = optional fifth argument to list trait names so they can be named in your model trait.names <- c("SCZ", "BIP") LDSC_SCZ_BIP <- ldsc( traits = traits, sample.prev = sample.prev, population.prev = population.prev, ld = ld, wld = wld, trait.names = trait.names ) ## The output first reports the observed-scale h2 and genetic covariance. ## At the end, the liability-scale estimates are reported. ## Save for later use save(LDSC_SCZ_BIP , file = "LDSC_SCZ_BIP.RData") #genetic covariance matrix LDSC_SCZ_BIP$S #genetic correlation matrix cov2cor(LDSC_SCZ_BIP$S) #lets grab the standard errors from the V matrix k <- nrow(LDSC_SCZ_BIP$S) SE <- matrix(0, k, k) SE[lower.tri(SE, diag = TRUE)] <- sqrt(diag(LDSC_SCZ_BIP$V)) #matrix of Z-stats Z <- LDSC_SCZ_BIP$S / SE #matrix of p-values P <- 2 * pnorm(Z, lower.tail = FALSE) ##### Step 3 (Optional): create rg heatmap #### rownames(LDSC_SCZ_BIP$S) <- colnames(LDSC_SCZ_BIP$S) corrplot( corr = cov2cor(LDSC_SCZ_BIP$S), method = "color", addCoef.col = "dark grey", add = F, bg = "white", diag = T, outline = T, mar = c(0, 0, 2, 0), number.cex = 2, cl.pos = "b", cl.ratio = 0.125, cl.align.text = "l", cl.offset = 0.2, tl.srt = 45, tl.pos = "lt", tl.offset = 0.2, tl.col = "black", pch.col = "white", addgrid.col = "black", xpd = T, tl.cex = 1.3, is.corr = TRUE, title = "SCZ / BIP" ) rm(list = ls()) # END. ###########################