setwd("2024/PGSpractical") library(data.table) ## BONUS get number of SNPs##### snps_used <- fread("output/PGI_height_1kG_ph3_EAS.sscore.vars", header=F) gwas <- fread("input/GIANT_HEIGHT_YENGO_2022_GWAS_SUMMARY_STATS_EAS_1e-2") snps_used_pval <- merge(snps_used, gwas, by.x= "V1", by.y="RSID") head(snps_used_pval) # Number of SNPs with p<0.01 nrow(snps_used_pval[snps_used_pval$P<0.01,]) nrow(snps_used_pval[snps_used_pval$P<0.001,]) nrow(snps_used_pval[snps_used_pval$P<1e-5,]) nrow(snps_used_pval[snps_used_pval$P<5e-8,]) ### STEP 6 ##### setwd("/home/YOURNAME/Day4/PGS/PGSpractical") library(data.table) # Read the PGS files df_PGI_p1e2 <- fread("output/PGI_height_1kG_ph3_EAS.p1e-2.sscore", header=T) df_PGI_p1e3 <- fread("output/PGI_height_1kG_ph3_EAS.p1e-3.sscore", header=T) df_PGI_p1e5 <- fread("output/PGI_height_1kG_ph3_EAS.p1e-5.sscore", header=T) df_PGI_p5e8 <- fread("output/PGI_height_1kG_ph3_EAS.p5e-8.sscore", header=T) # How do these files look like? head(df_PGI_p1e2) # Make histograms hist(df_PGI_p1e2$SCORE1_AVG, breaks=30) hist(df_PGI_p1e3$SCORE1_AVG, breaks=30) hist(df_PGI_p1e5$SCORE1_AVG, breaks=30) hist(df_PGI_p5e8$SCORE1_AVG, breaks=30) # Standardize the PGS df_PGI_p1e2$PGI_p1e2_std <- scale(df_PGI_p1e2$SCORE1_AVG) df_PGI_p1e3$PGI_p1e3_std <- scale(df_PGI_p1e3$SCORE1_AVG) df_PGI_p1e5$PGI_p1e5_std <- scale(df_PGI_p1e5$SCORE1_AVG) df_PGI_p5e8$PGI_p5e8_std <- scale(df_PGI_p5e8$SCORE1_AVG) # Read in phenotype data HEIGHT <- read.table("input/HEIGHT.pheno", header=T) # notice that the phenotype data contains less participants than the genetic data # Merge all data frames df <- merge(HEIGHT, df_PGI_p1e2, by="IID") df <- merge(df, df_PGI_p1e3, by="IID") df <- merge(df, df_PGI_p1e5, by="IID") df <- merge(df, df_PGI_p5e8, by="IID") # You might get a warning as some columns have the same name, but this does not affect us (ideally you would clean up this output) # Run linear regression of height on each PGI and sex model_p1e2 <- lm(height ~ PGI_p1e2_std + sex, data=df) model_p1e3 <- lm(height ~ PGI_p1e3_std + sex, data=df) model_p1e5 <- lm(height ~ PGI_p1e5_std + sex, data=df) model_p5e8 <- lm(height ~ PGI_p5e8_std + sex, data=df) summary(model_p5e8) # Run linear regression of height on sex as the baseline model model_baseline <- lm(height ~ sex, data=df) # Get incremental Rsq summary(model_p1e2)$r.squared - summary(model_baseline)$r.squared summary(model_p1e3)$r.squared - summary(model_baseline)$r.squared summary(model_p1e5)$r.squared - summary(model_baseline)$r.squared summary(model_p5e8)$r.squared - summary(model_baseline)$r.squared # Add PCs to the models PC <- paste0("PC", 1:10) PC <- paste0(PC, collapse="+") model_baseline <- lm(paste0("height ~ sex +", PC), data=df) model_p1e2 <- lm(paste0("height ~ PGI_p1e2_std + sex +",PC), data=df) model_p1e3 <- lm(paste0("height ~ PGI_p1e3_std + sex +",PC), data=df) model_p1e5 <- lm(paste0("height ~ PGI_p1e5_std + sex +",PC), data=df) model_p5e8 <- lm(paste0("height ~ PGI_p5e8_std + sex +",PC), data=df) # Get incremental-Rsq of adding the PGI to a model that controls for sex and 10 PCs r2_p1e2 <- summary(model_p1e2)$r.squared - summary(model_baseline)$r.squared r2_p1e3 <-summary(model_p1e3)$r.squared - summary(model_baseline)$r.squared r2_p1e5 <-summary(model_p1e5)$r.squared - summary(model_baseline)$r.squared r2_p5e8 <-summary(model_p5e8)$r.squared - summary(model_baseline)$r.squared # Bootstrapping function to get 95% CIs # set seed number set.seed(42) bs_func <- function(s,PGI){ # draw indexes for resampling (with replacement) BOOT_IN = sample(1:nrow(df), nrow(df), replace=TRUE) # compute incremental R2 given the drawn indexes. summary(lm(paste0("height ~ sex +", PGI, "+", PC), df[BOOT_IN,]))$r.squared - summary(lm(paste0("height ~ sex + ", PC), df[BOOT_IN,]))$r.squared } # Run the bootstrapping function for each PGI rep = 1000 pgi = c() lci = c() uci = c() for (p in c("1e2", "1e3", "1e5", "5e8")) { BOOT = sapply(1:rep, bs_func, paste0("PGI_p",p,"_std")) BOOT = sort(BOOT) pgi = c(pgi,p) lci = c(lci,BOOT[25]) uci = c(uci,BOOT[975]) } #combine the results in a nice dataframe to make plotting easier rci <- data.frame(pgi,lci, uci) names(rci) <- c("pgi","lci","uci") rci$estimate <- c(r2_p1e2,r2_p1e3,r2_p1e5,r2_p5e8) # Create the barplot with error bars library(ggplot2) p <- ggplot(rci, aes(x = pgi, y = estimate)) + geom_bar(stat = "identity", fill = "skyblue", color = "blue") + geom_errorbar(aes(ymin = lci, ymax = uci), width = 0.2, color = "black", size = 0.7) + labs(x = "PGI", y = "R-squared", title = "R-squared with 95% CI") + theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Optional: Rotate x-axis labels # Print the plot p