# PRSice code - run this in the ssh terminal window Rscript /usr/local/bin/PRSice.R --dir . --prsice /usr/local/bin/PRSice --base weights.prs --target ozbmi --thread 1 --snp SNP --chr CHR --bp BP --A1 A1 --A2 A2 --stat BETA --pvalue P --beta --pheno ozht.pheno --score sum --all-score --bar-levels 0.00000005,0.000001,0.0001,0.01,1 --fastscore --quantile 5 --quant-ref 1 --binary-target F --no-regress #*************************************************************************************************************************** # R code run in R Studio rm(list=ls()) setwd("~/2022/working") # Read in the phenotype covariance and prs files pheno<-read.table("ozht.pheno", header=T) covar<-read.table("ozht.cov", header=T) PRS<-read.table("PRSice.all_score", header=T) #lets give the PRS variables some more user friendly names head(PRS) names(PRS)<-c("FID","IID","PRS1","PRS2","PRS3","PRS4","PRS5") head(PRS) #and convert the height to cm pheno$ht<-pheno$ht*100 # Merge the pheno covar PRS and zyg files # we are using [ ] to subset the columns so that the FID is not duplicated in the merged files temp=merge(pheno, covar[,2:8], by="IID") longData=merge(temp, PRS[,2:7], by="IID") #Next we are going to run a Null model that includes the covariates but it doesn't include any PRS nullModel<-lm(ht~age+sex+PC1+PC2+PC3+PC4, data=longData) summary(nullModel) #Then we are going to run a model that includes the covariates and the first PRS thresh1<-lm(ht~age+sex+PC1+PC2+PC3+PC4+PRS1, data=longData) summary(thresh1) #We can compute the difference in R2 between the 2 models using the following code summary(thresh1)$r.squared-summary(nullModel)$r.squared #*********************************************************************************************************************** # Your task is to run a series of lm models to evaluate the predictive abilites of each of the 5 PRS. #*********************************************************************************************************************** # Read in the zygosity information zyg<-read.table("ozht.zyg", header=T) #We will then merge on the zyg data using the individual ID "IID" as the key. longData1=merge(longData, zyg[,2:5], by="IID") #Take a look at the result to check that it worked head(longData1) #For openMx we need our data to be in a wide format with one line per family so we will need to reshape the data. #In this code the variables listed in v.names are present for both twins #The variable Twin specifies if the person is twin 1 or 2 wideData=reshape(data = longData1, direction = "wide", v.names = c("IID", "ht", "age", "sex", "PC1", "PC2", "PC3", "PC4", "PRS1", "PRS2", "PRS3", "PRS4", "PRS5"), idvar = "FID", timevar = "Twin", sep="_") head(wideData) # We will replace all missingness in the definition variables with 0 # the people with missing definition variables don't have phenotypic data so this will not bais estimates wideData[,c(6:16,19:29)][is.na(wideData[,c(6:16,19:29)])] <- 0 #*********************************************************************************************************************** # Next we are going to run the model in openMx to correct for relatedness # ---------------------------------------------------------------------------------------------------------------------- # Author: Sarah Medland (adpted from oneSATca.R by Hermine Maes) # Date: 01 06 2022 # # Twin Univariate Saturated model with one means and one variance across multiple groups - with PRS as a covariate # Matrix style model - Raw data - Continuous data # -------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------| # Load Libraries & Options library(OpenMx) mxOption( NULL, "Default optimizer", "NPSOL" ) source("miFunctions5272022.R") # Create Output #filename <- "SaturatedModel_withPRS" #sink(paste(filename,".Ro",sep=""), append=FALSE, split=TRUE) # ---------------------------------------------------------------------------------------------------------------------- # Data used in this script will be the wideData object # ---------------------------------------------------------------------------------------------------------------------- # Select Variables for Analysis vars <- 'ht' # list of variables names nv <- 1 # number of variables ntv <- nv*2 # number of total variables depVar <- c("ht_1","ht_2") # dependent variables indVar <- c("age_1", "sex_1", "PC1_1", "PC2_1", "PC3_1", "PC4_1", "PRS1_1", "age_2", "sex_2", "PC1_2", "PC2_2", "PC3_2", "PC4_2", "PRS1_2") # independent variables or predictors selVars <- c(depVar,indVar) # Select Data for Analysis mzData <- subset(wideData, MZDZ==1, selVars) dzData <- subset(wideData, MZDZ==2, selVars) # Set Starting Values svMe <- 170 # start value for means svVa <- 100 # start value for variance lbVa <- 0.00001 # lower bound for variance svBe <- 0.0001 # start value for means # Create Data Objects for Multiple Groups dataMZ <- mxData( observed=mzData, type="raw" ) dataDZ <- mxData( observed=dzData, type="raw" ) # ---------------------------------------------------------------------------------------------------------------------- # PREPARE MODEL # Create Algebra for expected Mean Matrices # Objects to hold definition variables for the regression dPC1 <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.PC1_1","data.PC1_2"), name="dPC1" ) dPC2 <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.PC2_1","data.PC2_2"), name="dPC2" ) dPC3 <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.PC3_1","data.PC3_2"), name="dPC3" ) dPC4 <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.PC4_1","data.PC4_2"), name="dPC4" ) dsex <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.sex_1","data.sex_2"), name="dsex" ) dage <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.age_1","data.age_2"), name="dage" ) dPRS <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.PRS1_1","data.PRS1_2"), name="dPRS" ) # Objects to hold betas for the covariates bPC1 <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b1", name="bPC1" ) bPC2 <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b2", name="bPC2" ) bPC3 <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b3", name="bPC3" ) bPC4 <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b4", name="bPC4" ) bsex <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b5", name="bsex" ) bage <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b6", name="bage" ) bPRS <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b7", name="bPRS" ) # Build the means model meanG <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svMe, labels=labVars("mean",vars), name="meanG" ) ExpMean <- mxAlgebra( expression= (meanG + bPC1%x%dPC1 + bPC2%x%dPC2 + bPC3%x%dPC3 + bPC4%x%dPC4 + bsex%x%dsex + bage%x%dage + bPRS%x%dPRS) , name="expMean" ) pars<- c(bPC1, bPC2, bPC3, bPC4, bsex, bage, bPRS, meanG) defs<- c(dPC1, dPC2, dPC3, dPC4, dsex, dage, dPRS) # Create Algebra for expected Variance/Covariance Matrices covMZ <- mxMatrix( type="Symm", nrow=ntv, ncol=ntv, free=TRUE, values=c(.1,.05,.1), lbound=valDiag(lbVa,ntv), labels=c("var","cMZ21","var"), name="covMZ" ) covDZ <- mxMatrix( type="Symm", nrow=ntv, ncol=ntv, free=TRUE, values=c(.1,.05,.1), lbound=valDiag(lbVa,ntv), labels=c("var","cDZ21","var"), name="covDZ" ) # Create Expectation Objects for Multiple Groups expMZ <- mxExpectationNormal( covariance="covMZ", means="expMean", dimnames=depVar ) expDZ <- mxExpectationNormal( covariance="covDZ", means="expMean", dimnames=depVar ) funML <- mxFitFunctionML() # Create Model Objects for Multiple Groups modelMZ <- mxModel( pars, defs, ExpMean, covMZ, dataMZ, expMZ, funML, name="MZ" ) modelDZ <- mxModel( pars, defs, ExpMean, covDZ, dataDZ, expDZ, funML, name="DZ" ) multi <- mxFitFunctionMultigroup( c("MZ","DZ") ) # Build Saturated Model model_wPRS <- mxModel( "wPRS", pars, modelMZ, modelDZ, multi ) # ---------------------------------------------------------------------------------------------------------------------- # RUN MODEL # Run Saturated Model fit_wPRS <- mxRun( model_wPRS ) (sum_wPRS <- summary( fit_wPRS )) # ---------------------------------------------------------------------------------------------------------------------- # RUN SUBMODELS # Drop the PRS from the model (running the null model) model_Null <- mxModel( fit_wPRS, name="noPRS" ) model_Null <- omxSetParameters( model_Null, labels="b7", free=FALSE, values=0 ) fit_Null <- mxRun( model_Null) (sum_Null <- summary( fit_Null )) # ---------------------------------------------------------------------------------------------------------------------- # Print Comparative Fit Statistics mxCompare( fit_wPRS, fit_Null ) # calculate r2 # difference in variance between the models gives variance explained by PRS # divided by total variance to express as a % (fit_Null$MZ$covMZ$values[1,2]-fit_wPRS$MZ$covMZ$values[1,2])/fit_Null$MZ$covMZ$values[1,2] # ---------------------------------------------------------------------------------------------------------------------- #sink() #*********************************************************************************************************************** # Your task is to run a series of openMx models to evaluate the predictive abilites of each of the 5 PRS. #*********************************************************************************************************************** - remember to find and replace all mentions of the PRS variable in the openMx code #*********************************************************************************************************************** # Next we are going to run the analyses in GCTA #*********************************************************************************************************************** !!! Run this in the ssh terminal # Make GRM using GCTA gcta64 --bfile ozbmi --make-grm --out ozGRM !!!!Run the code below in R #We can take a look at the contents of the GRM using the following code # Open GRM and visualise off-diagonal distribution ReadGRMBin=function(prefix, AllN=F, size=4){ sum_i=function(i){ return(sum(1:i)) } BinFileName=paste(prefix,".grm.bin",sep="") NFileName=paste(prefix,".grm.N.bin",sep="") IDFileName=paste(prefix,".grm.id",sep="") id = read.table(IDFileName) n=dim(id)[1] BinFile=file(BinFileName, "rb"); grm=readBin(BinFile, n=n*(n+1)/2, what=numeric(0), size=size) NFile=file(NFileName, "rb"); if(AllN==T){ N=readBin(NFile, n=n*(n+1)/2, what=numeric(0), size=size) } else N=readBin(NFile, n=1, what=numeric(0), size=size) i=sapply(1:n, sum_i) return(list(diag=grm[i], off=grm[-i], id=id, N=N)) } GRM=ReadGRMBin(prefix = "ozGRM") #We can look at the distribution of diagonal values (ie the covariation of a person with themselves) using the following code. hist(GRM$diag, breaks=100) # Large values can indicate data problems or outstanding homozygosity rate #Usually we would expect the values to be close to 1. This teaching example only includes ~10,000 snps so there is more variation here than we would usually expect. #How many off diagonal elements are in this matrix (ie covarations between people)? length(GRM$off) #We can look at the off diagnoal elements using the following code hist(GRM$off, breaks = 100) #This plot isn't very informative as most relationship coefficents are close to 0. We can update our plot to only show relationship coefficents greater than 0.05 using the following code hist(GRM$off[which(GRM$off>0.05)], breaks=100) #This plot clearly shows we have a lot of siblings and MZ twin pairs. If we zoom in even further we might spot those cousins hist(GRM$off[which(GRM$off>0.05)], breaks=100, xlim=c(0.05,0.3) #We will use R to write out the files to run GCTA #We'll start by reading in the data files again incase they have been lost prs=read.table("PRSice.all_score", header=T) covar<-read.table("ozht.cov", header=T) pheno<-read.table("ozht.pheno", header=T) #Then rescaling and renaming pheno$ht<-pheno$ht*100 names(prs)<-c("FID","IID","PRS1","PRS2","PRS3","PRS4","PRS5") #Then merging the data into 1 dataframe all=merge(covar, prs, by=c("IID", "FID")) all=merge(all, pheno, by=c("IID", "FID")) # Then we'll write out the covariate files for the continuous covariates that we want to include in the GCTA model # It's important that the PRS is in the last column of this file write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4", "PRS1")], "ozht.prs1.qcovar", quote = F, row.names = F) write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4", "PRS2")], "ozht.prs2.qcovar", quote = F, row.names = F) write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4", "PRS3")], "ozht.prs3.qcovar", quote = F, row.names = F) write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4", "PRS4")], "ozht.prs4.qcovar", quote = F, row.names = F) write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4", "PRS5")], "ozht.prs5.qcovar", quote = F, row.names = F) # Then we'll write out the covariate files for the binary covariates that we want to include in the GCTA model write.table(all[,c("FID", "IID", "sex")], "ozht.covar", quote = F, row.names = F) # Then we'll write out the phenotype files for the GCTA model write.table(all[,c("FID", "IID", "ht")], "ozht.phen", quote = F, row.names = F) #Now we are ready to run our gcta analyses. !!!You will run these analyses by pasting this code into the ssh terminal gcta64 --pheno ozht.phen --qcovar ozht.prs1.qcovar --covar ozht.covar --grm ozGRM --out ozhtPRS1 --reml --reml-est-fix gcta64 --pheno ozht.phen --qcovar ozht.prs2.qcovar --covar ozht.covar --grm ozGRM --out ozhtPRS2 --reml --reml-est-fix gcta64 --pheno ozht.phen --qcovar ozht.prs3.qcovar --covar ozht.covar --grm ozGRM --out ozhtPRS3 --reml --reml-est-fix gcta64 --pheno ozht.phen --qcovar ozht.prs4.qcovar --covar ozht.covar --grm ozGRM --out ozhtPRS4 --reml --reml-est-fix gcta64 --pheno ozht.phen --qcovar ozht.prs5.qcovar --covar ozht.covar --grm ozGRM --out ozhtPRS5 --reml --reml-est-fix #GCTA will make a .log file and an .hsq file for each model. #The beta and SE of the PRS effect can be found in the line labeled X_7 in the log files. #We can pull these out of the log files and save them to a file with the following code grep X_7 *.log > gcta_results.txt !!!! Back to R # We can read in the gcta results res<-as.data.frame(read.table("gcta_results.txt")) names(res)<-c("Model","beta","se") # To calculate R2 we scale beta into a marginal correlation and then square it # We are calculating this and adding it to the res object res$r2<-rbind((res[1,2]/sd(all$ht, na.rm = T)*sd(all$PRS1 , na.rm = T))**2, (res[2,2]/sd(all$ht, na.rm = T)*sd(all$PRS2 , na.rm = T))**2, (res[3,2]/sd(all$ht, na.rm = T)*sd(all$PRS3 , na.rm = T))**2, (res[4,2]/sd(all$ht, na.rm = T)*sd(all$PRS4 , na.rm = T))**2, (res[5,2]/sd(all$ht, na.rm = T)*sd(all$PRS5 , na.rm = T))**2) # To test the significance of the betas we can use t-tests res$tStat<-res$beta/res$se # Follows a student distribution res$pval<-(1-pt(q=tStat, df = 1887, lower.tail = T))*2 # Now we can look at the results res #*********************************************************************************************************************** # Now we are going to make some plots #*********************************************************************************************************************** # Make barplots to summarise associations r2 <- data.frame( name = c("5e-08", "1e-06", "0.0001", "0.01", "1"), R2 = c(1.236,1.913,2.218,2.340,2.561 ), P=c(0.000195,5.04e-06,6.91e-07,2.67e-07, 6.12e-08) ) # Increase bottom margin par(mar=c(4,4,4,4)) # Barplot #jpeg("height.R2.jpg", width = 800, height = 800, quality=75, bg = "white", type = "cairo") my_bar <- barplot(r2$R2 , border=F , names.arg=r2$name , las=2 , col=c('skyblue1','skyblue2','skyblue3','skyblue4') , ylim=c(0,3) , main="Predicting height from height PRS", ylab="% variance explained", xlab="PRS Thresholds") # Add P values to the bars text(my_bar, r2$R2+0.04 , paste("P=", r2$P, sep="") ,cex=1) #dev.off() # Next we'll plot height by quintile of PRS (often these plots are made with deciles but the sample is small so we're going to use quintiles here) library(Hmisc) library(gplots) # the following command divides the data into quintiles. longData$percentiles <- factor(cut2(longData$PRS5, g=5, levels.mean=TRUE)) # Plot the mean of height by precentile with 95% CIs #jpeg("Quintile.jpg", width = 800, height = 800, quality=75, bg = "white", type = "cairo") plotmeans(ht ~ percentiles, data = longData, xlab="Quintile of PRS", ylab="Height in Cm",legends = c("1","2","3","4","5")) #dev.off() #**************************************************************************************************************** # STRETCH GOAL - Optional analysis - adapt code to fit ACE model in GCTA # We can make the CRM by modyfing the GRM hist(GRM$off[which(GRM$off>0.20)], breaks=100) # Remember that the sample contains MZ and DZ twins (or siblings) # Format GRM into a NxN matrix (simpler to write later) asNxNGRM<-function(BRMbin){ mat<-matrix(0, nrow = length(BRMbin$diag), ncol = length(BRMbin$diag)) mat[upper.tri(mat)]<-BRMbin$off mat<-mat+t(mat) diag(mat)<-BRMbin$diag colnames(mat)<-BRMbin$id[,2] rownames(mat)<-BRMbin$id[,2] return(mat) } # convert GRM into NxN matrix GRMmat=asNxNGRM(GRM) # We can build the CRM as a modified GRM CRM=GRMmat CRM[which(CRM> 0.20, arr.ind = T)]=1 # we assume that individuals with GRM>0.2 have grown up together -> C=1 CRM[which(CRM< 0.20, arr.ind = T)]=0 # other less related individuals have no shared environment -> C=0 # Check how many 1 and 0 we have table(CRM[upper.tri(CRM)]) # 978 families table(diag(CRM)) # check diagonal # We get the info about family ID and IID - needed to create the .grm.id file fam=GRM$id colnames(fam)=c("fam", "id") # We get this package which allows writing a GRM # install.packages("genio") library(genio) # needed to write_grm genio::write_grm(kinship = CRM, name = "ozCRM", fam = fam ) # We will also write out a continuous covariate file that does not include a PRS write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4")], "ozht.noprs.qcovar", quote = F, row.names = F) ## Fit several random effects in GCTA !!!!!!!!! This code should be run in terminal # We create this text file, which stores the name of the different GRM/CRM matrices so GCTA knows which to open echo "ozGRM" > GRM-CRM.txt echo "ozCRM" >> GRM-CRM.txt head GRM-CRM.txt gcta64 --pheno ozht.phen --qcovar ozht.prs1.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozhtPRS1.ACE --reml --reml-est-fix gcta64 --pheno ozht.phen --qcovar ozht.prs2.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozhtPRS2.ACE --reml --reml-est-fix gcta64 --pheno ozht.phen --qcovar ozht.prs3.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozhtPRS3.ACE --reml --reml-est-fix gcta64 --pheno ozht.phen --qcovar ozht.prs4.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozhtPRS4.ACE --reml --reml-est-fix gcta64 --pheno ozht.phen --qcovar ozht.prs5.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozhtPRS5.ACE --reml --reml-est-fix gcta64 --pheno ozht.phen --qcovar ozht.noprs.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozht.ACE.noPRS --reml --reml-est-fix grep X_7 *ACE.log > gcta_results_ACE.txt ## Format results !!!!!!!!! This code should be run in Rstudio # We can read in the gcta results resACE<-as.data.frame(read.table("gcta_results_ACE.txt")) names(resACE)<-c("Model","beta","se") # To calculate R2 we scale beta into a marginal correlation and then square it # We are calculating this and adding it to the resACE object resACE$r2<-rbind((resACE[1,2]/sd(all$ht, na.rm = T)*sd(all$PRS1 , na.rm = T))**2, (resACE[2,2]/sd(all$ht, na.rm = T)*sd(all$PRS2 , na.rm = T))**2, (resACE[3,2]/sd(all$ht, na.rm = T)*sd(all$PRS3 , na.rm = T))**2, (resACE[4,2]/sd(all$ht, na.rm = T)*sd(all$PRS4 , na.rm = T))**2, (resACE[5,2]/sd(all$ht, na.rm = T)*sd(all$PRS5 , na.rm = T))**2) # To test the significance of the betas we can use t-tests # We are calculating this and adding it to the resACE object resACE$tStat<-resACE$beta/resACE$se # Follows a student distribution resACE # Now we can look at the results resACE