## LDSC Practical 2 - Genetic Correlation
## MadhurBain Singh, Benjamin Neale
## March 5th, 2025
## Adapted from Andrew Grotzinger, March 2023

# Qualtrics: https://qimr.az1.qualtrics.com/jfe/form/SV_781CgwvIn2YqyqO 

library(GenomicSEM)
library(corrplot)     ## Optional - for making heatmaps

## Directory for this session
setwd("~/practicals/3.2.GeneticCorrelation_MadhurSingh/final/")

## In this session, we will analyze data from chr 1 only.

# PART 1: Continuous Traits Example in European Ancestry ============================================

setwd("EUR")

#### HapMap3 SNPs ======================================

## As before, we will restrict the analyses to high-quality SNPs in the GWAS sum stats
## Which also overlap with the SNPs we have pre-computed reference LD scores. 
## Here, LD scores have been computed using HapMap3 SNPs.

hm3 <- "eur_w_ld_chr/w_hm3.snplist"
## Take a look at the SNP list
## Tip: system() sends the command to the Terminal instead of R Console
system(paste0("head ", hm3))
## has information on SNP ID, allele 1, allele 2.

## The munge() will also align the alleles across the GWAS sum stats and the reference data.

system(paste0("wc -l ", hm3))
## This command tells how many lines (= rows) there are in your file.
## Note that the number of lines also includes the line with column names.

#### How many SNPs are there in the HapMap3 reference file?
## 1217311 HM3 SNPs in the reference data

##### Step 1: Munge the data ####

## Look at the sum stats and the columns in these files
## 1. BMI
system("head GIANT_UKB_BMI_EUR_chr1.txt")
## 2. Height
system("head Yengo_Height_EUR_chr1.txt")

## Note that both files include sample size
## So, we'd not specify N in the munge() step.

## For multivariate LDSC, we specify a vector of the file paths with GWAS sum stats.
files <- c("GIANT_UKB_BMI_EUR_chr1.txt", "Yengo_Height_EUR_chr1.txt")

## Name the traits
## The order should match the order of file paths.
trait.names <- c("BMI_chr1", "Height_chr1")

#define the imputation quality filter
info.filter = 0.9
#define the MAF filter
maf.filter = 0.01

#run munge
munge(
  files = files,
  hm3 = hm3,
  trait.names = trait.names,
  # N = N,                       ## Not specifying N here.
  info.filter = info.filter,
  maf.filter = maf.filter
)

## This function will perform munge separately for each trait.
## Examine that the columns were interpreted correctly for both files.
## SNP, A1 (effect allele), A2 (other allele), Beta/Effect size, P value, N, MAF


##### Step 2: run LDSC ####

## Examine the munged sum stats
system("zcat BMI.sumstats.gz | head")
system("zcat Height.sumstats.gz | head")
# Note: Ignore the warning "gzip: stdout: Broken pipe"

# traits = the name of the .sumstats.gz traits produced by munge
traits <- c("BMI.sumstats.gz", "Height.sumstats.gz")

## ld = folder of LD scores
ld <- "eur_w_ld_chr/"

# wld = the same folder as LD scores
wld <- "eur_w_ld_chr/"

# sample.prev = the proportion of cases to total sample size. For quantitative traits list NA
sample.prev <- c(NA, NA)

# population.prev = the population lifetime prevalence of the traits. For quantitative traits list NA
population.prev <- c(NA, NA)

# trait.names = optional fifth argument to list trait names so they can be named in your model
# follow the same order as the munged files in "traits"
trait.names <- c("BMI", "Height")

LDSC_EUR <- ldsc(
  traits = traits,
  sample.prev = sample.prev,
  population.prev = population.prev,
  ld = ld,
  wld = wld,
  trait.names = trait.names
)

## Save for future use
save(LDSC_EUR, file = "LDSC_EUR.RData")

## The log/output has three main parts:
## [1/3] LDSC analyses of Trait 1 (h2_SNP)
## [2/3] Cross-trait LDSC analyses of Trait 1 and 2 (r_g)
## [3/3] LDSC analyses of Trait 2 (h2_SNP)

# Note that the heritability estimates and univariate LDSC intercepts obtained in univariate analyses
# will often be slightly different from heritability estimates and intercepts in bivariate analyses
# This is because bivariate analysis uses the intersection of the SNPs in the two GWAS sum stats.
# Univariate analysis uses all of the SNPs in the respective GWAS sum stats.
# Usually these two sets of SNPs will be slightly different, causing small differences in the estimates.


#genetic covariance matrix
LDSC_EUR$S

#genetic correlation matrix
cov2cor(LDSC_EUR$S)

#lets grab the standard errors from the V matrix
k <- nrow(LDSC_EUR$S)
SE <- matrix(0, k, k)
SE[lower.tri(SE, diag = TRUE)] <- sqrt(diag(LDSC_EUR$V))
SE

#matrix of Z-stats
Z <- LDSC_EUR$S / SE
Z

#matrix of p-values
P <- 2 * pnorm(Z, lower.tail = FALSE)
P


##### Step 3 (Optional): create rg heatmap ####

rownames(LDSC_EUR$S) <- colnames(LDSC_EUR$S)

corrplot(
  corr = cov2cor(LDSC_EUR$S),
  method = "color",
  addCoef.col = "dark grey",
  add = F,
  bg = "white",
  diag = T,
  outline = T,
  mar = c(0, 0, 2, 0),
  number.cex = 2,
  cl.pos = "b",
  cl.ratio = 0.125,
  cl.align.text = "l",
  cl.offset = 0.2,
  tl.srt = 45,
  tl.pos = "lt",
  tl.offset = 0.2,
  tl.col = "black",
  pch.col = "white",
  addgrid.col = "black",
  xpd = T,
  tl.cex = 1.3,
  is.corr = TRUE,
  title = "European"
)

rm(list = ls())


# PART 2: Continuous Traits Example in East Asian Ancestry ==========================================

setwd("../EAS")

## Similar steps as above.
## The sum stats and LD scores are from East Asian Ancestry


##### Step 1: Munge the data ####

system("head BMI_EAS_chr1.txt")
system("head Yengo_Height_EAS_chr1.txt")

files <- c("BMI_EAS_chr1.txt", "Yengo_Height_EAS_chr1.txt")

#define the reference file being used to align alleles across summary stats
#here we are using hapmap3
hm3 <- "eas_ldscores/w_hm3.snplist"

system(paste0("head ", hm3)) # Looks the same as before
system(paste0("wc -l ", hm3)) # Looks the same as before
# Has 1217311 SNPs

#name the traits
trait.names <- c("BMI", "Height")

# list the sample sizes.
# Yengo_Height has SNP-specific sample size in the file
# but BMI sum stats do not, so we will provide that here.
# NA for Height
N <- c(163835, NA)  

#definte the imputation quality filter
info.filter = 0.9
#define the MAF filter
maf.filter = 0.01

#run munge
munge(
  files = files,
  hm3 = hm3,
  trait.names = trait.names,
  N = N,
  info.filter = info.filter,
  maf.filter = maf.filter
)

## Again, examine that the columns were interpreted correctly for both files.
## SNP, A1 (effect allele), A2 (other allele), Beta/Effect size, P value, N, MAF


##### Step 2: run LDSC ####

## Examine the munged sum stats
system("zcat BMI.sumstats.gz | head")
system("zcat Height.sumstats.gz | head")
# Note: Ignore the warning "gzip: stdout: Broken pipe"

#traits = the name of the .sumstats.gz traits produced by munge
traits <- c("BMI.sumstats.gz", "Height.sumstats.gz")

##ld = folder of LD scores
ld <- "eas_ldscores/"

#wld = folder of LD scores
wld <- "eas_ldscores/"

#sample.prev = the proportion of cases to total sample size. For quantitative traits list NA
sample.prev <- c(NA, NA)

#population.prev = the population lifetime prevalence of the traits. For quantitative traits list NA
population.prev <- c(NA, NA)

#trait.names = optional fifth argument to list trait names so they can be named in your model
trait.names <- c("BMI", "Height")

LDSC_EAS <-
  ldsc(
    traits = traits,
    sample.prev = sample.prev,
    population.prev = population.prev,
    ld = ld,
    wld = wld,
    trait.names = trait.names
  )
# Save for later use
save(LDSC_EAS, file = "LDSC_EAS.RData")

#genetic covariance matrix
LDSC_EAS$S

## Note here that the diagonal elements are the SNP-h2 
## and the off-diagonal elements are genetic covariance

# Compare with EUR
load("../EUR/LDSC_EUR.RData")
LDSC_EUR$S

#genetic correlation matrix
cov2cor(LDSC_EAS$S)

#standard errors
k <- nrow(LDSC_EAS$S)
SE <- matrix(0, k, k)
SE[lower.tri(SE, diag = TRUE)] <- sqrt(diag(LDSC_EAS$V))
SE

#matrix of Z-stats
Z <- LDSC_EAS$S / SE
Z

#matrix of p-values
P <- 2 * pnorm(Z, lower.tail = FALSE)
P
# Note: the off-diagonal element gives the p-value of genetic correlation.

##### Step 3 (Optional): create rg heatmap ####

rownames(LDSC_EAS$S) <- colnames(LDSC_EAS$S)

corrplot(
  corr = cov2cor(LDSC_EAS$S),
  method = "color",
  addCoef.col = "dark grey",
  add = F,
  bg = "white",
  diag = T,
  outline = T,
  mar = c(0, 0, 2, 0),
  number.cex = 2,
  cl.pos = "b",
  cl.ratio = 0.125,
  cl.align.text = "l",
  cl.offset = 0.2,
  tl.srt = 45,
  tl.pos = "lt",
  tl.offset = 0.2,
  tl.col = "black",
  pch.col = "white",
  addgrid.col = "black",
  xpd = T,
  tl.cex = 1.3,
  is.corr = TRUE,
  title = "East Asian"
)

rm(list = ls())


# PART 3: Binary (case/control) Traits Example in European Ancestry ===============================

setwd("../EUR")


##### Step 1: Munge the data ####

system("head SCZ_chr1.txt")
system("head BIP_chr1.txt")

## Note that SCZ sum stats include the effective N (Neff)
## BIP sum stats include NEFFDIV2 (which is half of Neff)
## The munge function _can_ identify these different Ns.
## But we must ensure that these are interpreted correctly in the munge step below.

files <- c("SCZ_chr1.txt", "BIP_chr1.txt")

#define the reference file being used to align alleles across summary stats
#here we are using hapmap3
hm3 <- "eur_w_ld_chr/w_hm3.snplist"

#name the traits
trait.names <- c("SCZ", "BIP")

#list the sample sizes.
#we write NA here as these files already have Neff as columns
N <- c(NA, NA)

#define the imputation quality filter
info.filter = 0.9

#define the MAF filter
maf.filter = 0.01

#run munge
munge(
  files = files,
  hm3 = hm3,
  trait.names = trait.names,
  N = N,
  info.filter = info.filter,
  maf.filter = maf.filter
)

## Examine that the columns were interpreted correctly for both traits.

## Ensure that the reported NEFFDIV2 in BIP has been doubled to get Neff.


##### Step 2: run LDSC ####

## Examine the munged sum stats
system("zcat SCZ.sumstats.gz | head")
system("zcat BIP.sumstats.gz | head")
# Note: Ignore the warning "gzip: stdout: Broken pipe"

#traits = the name of the .sumstats.gz traits produced by munge
traits <- c("SCZ.sumstats.gz", "BIP.sumstats.gz")

##ld = folder of LD scores
ld <- "eur_w_ld_chr/"

#wld = folder of LD scores
wld <- "eur_w_ld_chr/"

#sample.prev: enter as 0.5 because used sum of effecitve N,
#that accounts for sample ascertainment
sample.prev <- c(.5, .5)

#population.prev = the population lifetime prevalence of the traits.
population.prev <- c(.01, .02)

#trait.names = optional fifth argument to list trait names so they can be named in your model
trait.names <- c("SCZ", "BIP")

LDSC_SCZ_BIP <-
  ldsc(
    traits = traits,
    sample.prev = sample.prev,
    population.prev = population.prev,
    ld = ld,
    wld = wld,
    trait.names = trait.names
  )

## The output first reports the observed-scale h2 and genetic covariance.
## At the end, the liability-scale estimates are reported.

## Save for later use
save(LDSC_SCZ_BIP , file = "LDSC_SCZ_BIP.RData")

#genetic covariance matrix
LDSC_SCZ_BIP$S

#genetic correlation matrix
cov2cor(LDSC_SCZ_BIP$S)

#lets grab the standard errors from the V matrix
k <- nrow(LDSC_SCZ_BIP$S)
SE <- matrix(0, k, k)
SE[lower.tri(SE, diag = TRUE)] <- sqrt(diag(LDSC_SCZ_BIP$V))

#matrix of Z-stats
Z <- LDSC_SCZ_BIP$S / SE

#matrix of p-values
P <- 2 * pnorm(Z, lower.tail = FALSE)


##### Step 3 (Optional): create rg heatmap ####

rownames(LDSC_SCZ_BIP$S) <- colnames(LDSC_SCZ_BIP$S)

corrplot(
  corr = cov2cor(LDSC_SCZ_BIP$S),
  method = "color",
  addCoef.col = "dark grey",
  add = F,
  bg = "white",
  diag = T,
  outline = T,
  mar = c(0, 0, 2, 0),
  number.cex = 2,
  cl.pos = "b",
  cl.ratio = 0.125,
  cl.align.text = "l",
  cl.offset = 0.2,
  tl.srt = 45,
  tl.pos = "lt",
  tl.offset = 0.2,
  tl.col = "black",
  pch.col = "white",
  addgrid.col = "black",
  xpd = T,
  tl.cex = 1.3,
  is.corr = TRUE,
  title = "SCZ / BIP"
)

rm(list = ls())


# END. ###########################