setwd("2024/PGSpractical")
library(data.table)

## BONUS get number of SNPs#####
snps_used <- fread("output/PGI_height_1kG_ph3_EAS.sscore.vars", header=F)
gwas <- fread("input/GIANT_HEIGHT_YENGO_2022_GWAS_SUMMARY_STATS_EAS_1e-2")
snps_used_pval <- merge(snps_used, gwas, by.x= "V1", by.y="RSID")
head(snps_used_pval)
# Number of SNPs with p<0.01
nrow(snps_used_pval[snps_used_pval$P<0.01,])
nrow(snps_used_pval[snps_used_pval$P<0.001,])
nrow(snps_used_pval[snps_used_pval$P<1e-5,])
nrow(snps_used_pval[snps_used_pval$P<5e-8,])


### STEP 6 #####
setwd("/home/YOURNAME/Day4/PGS/PGSpractical")
library(data.table)

# Read the PGS files 

df_PGI_p1e2 <- fread("output/PGI_height_1kG_ph3_EAS.p1e-2.sscore", header=T)
df_PGI_p1e3 <- fread("output/PGI_height_1kG_ph3_EAS.p1e-3.sscore", header=T)
df_PGI_p1e5 <- fread("output/PGI_height_1kG_ph3_EAS.p1e-5.sscore", header=T)
df_PGI_p5e8 <- fread("output/PGI_height_1kG_ph3_EAS.p5e-8.sscore", header=T)

# How do these files look like? 
head(df_PGI_p1e2)

# Make histograms
hist(df_PGI_p1e2$SCORE1_AVG, breaks=30)
hist(df_PGI_p1e3$SCORE1_AVG, breaks=30)
hist(df_PGI_p1e5$SCORE1_AVG, breaks=30)
hist(df_PGI_p5e8$SCORE1_AVG, breaks=30)

# Standardize the PGS
df_PGI_p1e2$PGI_p1e2_std <- scale(df_PGI_p1e2$SCORE1_AVG)
df_PGI_p1e3$PGI_p1e3_std <- scale(df_PGI_p1e3$SCORE1_AVG)
df_PGI_p1e5$PGI_p1e5_std <- scale(df_PGI_p1e5$SCORE1_AVG)
df_PGI_p5e8$PGI_p5e8_std <- scale(df_PGI_p5e8$SCORE1_AVG)

# Read in phenotype data
HEIGHT <- read.table("input/HEIGHT.pheno", header=T)
# notice that the phenotype data contains less participants than the genetic data 

# Merge all data frames
df <- merge(HEIGHT, df_PGI_p1e2, by="IID")
df <- merge(df, df_PGI_p1e3, by="IID")
df <- merge(df, df_PGI_p1e5, by="IID")
df <- merge(df, df_PGI_p5e8, by="IID")
# You might get a warning as some columns have the same name, but this does not affect us (ideally you would clean up this output)


# Run linear regression of height on each PGI and sex
model_p1e2 <- lm(height ~ PGI_p1e2_std + sex, data=df)
model_p1e3 <- lm(height ~ PGI_p1e3_std + sex, data=df)
model_p1e5 <- lm(height ~ PGI_p1e5_std + sex, data=df)
model_p5e8 <- lm(height ~ PGI_p5e8_std + sex, data=df)

summary(model_p5e8)

# Run linear regression of height on sex as the baseline model
model_baseline <- lm(height ~ sex, data=df)

# Get incremental Rsq
summary(model_p1e2)$r.squared - summary(model_baseline)$r.squared
summary(model_p1e3)$r.squared - summary(model_baseline)$r.squared
summary(model_p1e5)$r.squared - summary(model_baseline)$r.squared
summary(model_p5e8)$r.squared - summary(model_baseline)$r.squared

# Add PCs to the models
PC <- paste0("PC", 1:10)
PC <- paste0(PC, collapse="+")
model_baseline <- lm(paste0("height ~ sex +", PC), data=df)
model_p1e2 <- lm(paste0("height ~ PGI_p1e2_std + sex +",PC), data=df)
model_p1e3 <- lm(paste0("height ~ PGI_p1e3_std + sex +",PC), data=df)
model_p1e5 <- lm(paste0("height ~ PGI_p1e5_std + sex +",PC), data=df)
model_p5e8 <- lm(paste0("height ~ PGI_p5e8_std + sex +",PC), data=df)

# Get incremental-Rsq of adding the PGI to a model that controls for sex and 10 PCs
r2_p1e2 <- summary(model_p1e2)$r.squared - summary(model_baseline)$r.squared
r2_p1e3 <-summary(model_p1e3)$r.squared - summary(model_baseline)$r.squared
r2_p1e5 <-summary(model_p1e5)$r.squared - summary(model_baseline)$r.squared
r2_p5e8 <-summary(model_p5e8)$r.squared - summary(model_baseline)$r.squared

# Bootstrapping function to get 95% CIs
# set seed number
set.seed(42)

bs_func <- function(s,PGI){
  # draw indexes for resampling (with replacement)
  BOOT_IN = sample(1:nrow(df), nrow(df), replace=TRUE)
  
  # compute incremental R2 given the drawn indexes.
  summary(lm(paste0("height ~ sex +", PGI, "+", PC), df[BOOT_IN,]))$r.squared -  summary(lm(paste0("height ~ sex + ", PC), df[BOOT_IN,]))$r.squared  
}

# Run the bootstrapping function for each PGI
rep = 1000
pgi = c()
lci = c()
uci = c()
for (p in c("1e2", "1e3", "1e5", "5e8")) {
  BOOT = sapply(1:rep, bs_func, paste0("PGI_p",p,"_std"))
  BOOT = sort(BOOT)
  pgi = c(pgi,p)
  lci = c(lci,BOOT[25])
  uci = c(uci,BOOT[975])
}

#combine the results in a nice dataframe to make plotting easier 
rci <-  data.frame(pgi,lci, uci)
names(rci) <- c("pgi","lci","uci")
rci$estimate <- c(r2_p1e2,r2_p1e3,r2_p1e5,r2_p5e8)

# Create the barplot with error bars
library(ggplot2)
p <- ggplot(rci, aes(x = pgi, y = estimate)) +
  geom_bar(stat = "identity", fill = "skyblue", color = "blue") +
  geom_errorbar(aes(ymin = lci, ymax = uci), width = 0.2, color = "black", size = 0.7) +
  labs(x = "PGI", y = "R-squared", title = "R-squared with 95% CI") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Optional: Rotate x-axis labels

# Print the plot
p