--- title: "Day 7 ISGW workshop tutorial" author: "[Sarah Medland & Baptiste Couvy-Duchesne](https://github.com/baptisteCD)" date: "`r format(Sys.time(), '%d %B %Y')`" mail: "baptiste.couvyduchesne@gmail.com" github: "baptisteCD" output: epuRate::ISGW: toc: TRUE number_sections: FALSE code_folding: "show" --- ```{r, echo=FALSE, message=FALSE, warning=FALSE} # You need these libraries to run this template: library(rmarkdown) # install.packages("rmarkdown") library(epuRate) # devtools::install_github("baptisteCD/epuRate", force=TRUE) # Soft-wrap code in knitting knitr::opts_chunk$set(tidy.opts = list(width.cutoff = 60), tidy = TRUE) ```



> Answer sheet for the tutorial # Setting up environment ```{bash, message=FALSE, warning=FALSE, eval=F} mkdir -p day7 cd day7 cp -r /faculty/sarah/2022/Day7/* . ``` # PRS calculation using PRSice ```{bash, message=FALSE, warning=FALSE, eval=F} # PRSice code - run this in the ssh terminal window Rscript /usr/local/bin/PRSice.R --dir . --prsice /usr/local/bin/PRSice --base weights.prs --target ozbmi --thread 1 --snp SNP --chr CHR --bp BP --A1 A1 --A2 A2 --stat BETA --pvalue P --beta --pheno ozht.pheno --score sum --all-score --bar-levels 0.00000005,0.000001,0.0001,0.01,1 --fastscore --quantile 5 --quant-ref 1 --binary-target F --no-regress ``` # Open tables and merge ```{R, message=FALSE, warning=FALSE, eval=T} pheno<-read.table("ozht.pheno", header=T) covar<-read.table("ozht.cov", header=T) PRS<-read.table("PRSice.all_score", header=T) head(PRS) names(PRS)<-c("FID","IID","PRS1","PRS2","PRS3","PRS4","PRS5") head(PRS) pheno$ht<-pheno$ht*100 # Best to use names instead of column number temp=merge(pheno, covar[,2:8], by="IID") longData=merge(temp, PRS[,2:7], by="IID") # Run models nullModel<-lm(ht~age+sex+PC1+PC2+PC3+PC4, data=longData) summary(nullModel) #Then we are going to run a model that includes the covariates and the first PRS thresh1<-lm(ht~age+sex+PC1+PC2+PC3+PC4+PRS1, data=longData) summary(thresh1) #We can compute the difference in R2 between the 2 models using the following code summary(thresh1)$r.squared-summary(nullModel)$r.squared ``` # Format data - to one row per family (long -> wide) ```{R, message=FALSE, warning=FALSE, eval=T} zyg<-read.table("ozht.zyg", header=T) #We will then merge on the zyg data using the individual ID "IID" as the key. longData1=merge(longData, zyg[,2:5], by="IID") #Take a look at the result to check that it worked head(longData1) #For openMx we need our data to be in a wide format with one line per family so we will need to reshape the data. #In this code the variables listed in v.names are present for both twins #The variable Twin specifies if the person is twin 1 or 2 wideData=reshape(data = longData1, direction = "wide", v.names = c("IID", "ht", "age", "sex", "PC1", "PC2", "PC3", "PC4", "PRS1", "PRS2", "PRS3", "PRS4", "PRS5"), idvar = "FID", timevar = "Twin", sep="_" ) # We will replace all missingness in the definition variables with 0 # the people with missing definition variables don't have phenotypic data so this will not bais estimates wideData[,c(6:12,19:25)][is.na(wideData[,c(6:12,19:25)])] <- 0 ``` # OpenMx model ```{R, message=FALSE, warning=FALSE, eval=T} library(OpenMx) source("miFunctions5272022.R") # Select Variables for Analysis vars <- 'ht' # list of variables names nv <- 1 # number of variables ntv <- nv*2 # number of total variables depVar <- c("ht_1","ht_2") # dependent variables indVar <- c("age_1", "sex_1", "PC1_1", "PC2_1", "PC3_1", "PC4_1", "PRS1_1", "age_2", "sex_2", "PC1_2", "PC2_2", "PC3_2", "PC4_2", "PRS1_2") # independent variables or predictors selVars <- c(depVar,indVar) # Select Data for Analysis mzData <- subset(wideData, MZDZ==1, selVars) dzData <- subset(wideData, MZDZ==2, selVars) # Set Starting Values svMe <- 170 # start value for means svVa <- 100 # start value for variance lbVa <- 0.00001 # lower bound for variance svBe <- 0.0001 # start value for means # Create Data Objects for Multiple Groups dataMZ <- mxData( observed=mzData, type="raw" ) dataDZ <- mxData( observed=dzData, type="raw" ) # Objects to hold definition variables for the regression dPC1 <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.PC1_1","data.PC1_2"), name="dPC1" ) dPC2 <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.PC2_1","data.PC2_2"), name="dPC2" ) dPC3 <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.PC3_1","data.PC3_2"), name="dPC3" ) dPC4 <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.PC4_1","data.PC4_2"), name="dPC4" ) dsex <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.sex_1","data.sex_2"), name="dsex" ) dage <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.age_1","data.age_2"), name="dage" ) dPRS <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.PRS1_1","data.PRS1_2"), name="dPRS" ) # Objects to hold betas for the covariates bPC1 <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b1", name="bPC1" ) bPC2 <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b2", name="bPC2" ) bPC3 <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b3", name="bPC3" ) bPC4 <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b4", name="bPC4" ) bsex <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b5", name="bsex" ) bage <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b6", name="bage" ) bPRS <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b7", name="bPRS" ) #Then we will build the model for the means that includes the regression tems #(we are using Kronecker products here to mutliply the values of the independent variables by the betas) meanG <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svMe, labels=labVars("mean",vars), name="meanG" ) ExpMean <- mxAlgebra( expression= (meanG + bPC1%x%dPC1 + bPC2%x%dPC2 + bPC3%x%dPC3 + bPC4%x%dPC4 + bsex%x%dsex + bage%x%dage + bPRS%x%dPRS) , name="expMean" ) #and do some house keeping (making a list of the objects so we can pull these into the model later) pars<- c(bPC1, bPC2, bPC3, bPC4, bsex, bage, bPRS, meanG) defs<- c(dPC1, dPC2, dPC3, dPC4, dsex, dage, dPRS) # Create Algebra for expected Variance/Covariance Matrices covMZ <- mxMatrix( type="Symm", nrow=ntv, ncol=ntv, free=TRUE, values=c(.1,.05,.1), lbound=valDiag(lbVa,ntv),labels=c("var","cMZ21","var"), name="covMZ" ) covDZ <- mxMatrix( type="Symm", nrow=ntv, ncol=ntv, free=TRUE, values=c(.1,.05,.1), lbound=valDiag(lbVa,ntv), labels=c("var","cDZ21","var"), name="covDZ" ) #Next we build the expection and model objects # Create Expectation Objects for Multiple Groups expMZ <- mxExpectationNormal( covariance="covMZ", means="expMean", dimnames=depVar ) expDZ <- mxExpectationNormal( covariance="covDZ", means="expMean", dimnames=depVar ) funML <- mxFitFunctionML() # Create Model Objects for Multiple Groups modelMZ <- mxModel( pars, defs, ExpMean, covMZ, dataMZ, expMZ, funML, name="MZ" ) modelDZ <- mxModel( pars, defs, ExpMean, covDZ, dataDZ, expDZ, funML, name="DZ" ) multi <- mxFitFunctionMultigroup( c("MZ","DZ") ) # Build Saturated Model model_wPRS <- mxModel( "wPRS", pars, modelMZ, modelDZ, multi ) # Run model fit_wPRS <- mxRun( model_wPRS ) (sum_wPRS <- summary( fit_wPRS )) #And then run a submodel whic drops the PRS effect from the model # Drop the PRS from the model (running the null model) model_Null <- mxModel( fit_wPRS, name="noPRS" ) model_Null <- omxSetParameters( model_Null, labels="b7", free=FALSE, values=0 ) fit_Null <- mxRun( model_Null) (sum_Null <- summary( fit_Null )) #We can work out the significance of the PRS effect by comparing the fit of the models mxCompare( fit_wPRS, fit_Null ) #and calculate r2 as the difference in variance between the models gives variance explained by PRS, divided by total variance (fit_Null$MZ$covMZ$values[1,2]-fit_wPRS$MZ$covMZ$values[1,2])/fit_Null$MZ$covMZ$values[1,2] ``` # Calculate GRM using GCTA ```{bash, message=FALSE, warning=FALSE, eval=T} ./gcta64 --bfile ozbmi --make-grm --out ozGRM ``` # Open GRM and draw some plots ```{R, message=FALSE, warning=FALSE, eval=T} # Open GRM and visualise off-diagonal distribution ReadGRMBin=function(prefix, AllN=F, size=4){ sum_i=function(i){ return(sum(1:i)) } BinFileName=paste(prefix,".grm.bin",sep="") NFileName=paste(prefix,".grm.N.bin",sep="") IDFileName=paste(prefix,".grm.id",sep="") id = read.table(IDFileName) n=dim(id)[1] BinFile=file(BinFileName, "rb"); grm=readBin(BinFile, n=n*(n+1)/2, what=numeric(0), size=size) NFile=file(NFileName, "rb"); if(AllN==T){ N=readBin(NFile, n=n*(n+1)/2, what=numeric(0), size=size) } else N=readBin(NFile, n=1, what=numeric(0), size=size) i=sapply(1:n, sum_i) return(list(diag=grm[i], off=grm[-i], id=id, N=N)) } GRM=ReadGRMBin(prefix = "ozGRM") hist(GRM$diag, breaks=100) # Large values can indicate data problems or outstanding homozygosity rate # hist(GRM$off, breaks = 100) # This plot isn't very informative as most relationship coefficents are close to 0. We can update our plot to only show relationship coefficents greater than 0.05 using the following code hist(GRM$off[which(GRM$off>0.05)], breaks=100) #This plot clearly shows we have a lot of siblings and MZ twin pairs. If we zoom in even further we might spot those cousins hist(GRM$off[which(GRM$off>0.05)], breaks=100, xlim=c(0.05,0.3)) ``` # Write files for GCTA analysis ```{R, message=FALSE, warning=FALSE, eval=T} prs=read.table("PRSice.all_score", header=T) covar<-read.table("ozht.cov", header=T) pheno<-read.table("ozht.pheno", header=T) #Then rescaling and renaming pheno$ht<-pheno$ht*100 names(prs)<-c("FID","IID","PRS1","PRS2","PRS3","PRS4","PRS5") #Then merging the data into 1 dataframe all=merge(covar, prs, by=c("IID", "FID")) all=merge(all, pheno, by=c("IID", "FID")) # Then we'll write out the covariate files for the continuous covariates that we want to include in the GCTA model # It's important that the PRS is in the last column of this file write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4", "PRS1")], "ozht.prs1.qcovar", quote = F, row.names = F) write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4", "PRS2")], "ozht.prs2.qcovar", quote = F, row.names = F) write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4", "PRS3")], "ozht.prs3.qcovar", quote = F, row.names = F) write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4", "PRS4")], "ozht.prs4.qcovar", quote = F, row.names = F) write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4", "PRS5")], "ozht.prs5.qcovar", quote = F, row.names = F) # Then we'll write out the covariate files for the binary covariates that we want to include in the GCTA model write.table(all[,c("FID", "IID", "sex")], "ozht.covar", quote = F, row.names = F) # Then we'll write out the phenotype files for the GCTA model write.table(all[,c("FID", "IID", "ht")], "ozht.phen", quote = F, row.names = F) ``` # Run GCTA with fixed and random effect (AE model) ```{bash, message=FALSE, warning=FALSE, eval=T} ./gcta64 --pheno ozht.phen --qcovar ozht.prs1.qcovar --covar ozht.covar --grm ozGRM --out ozhtPRS1 --reml --reml-est-fix ./gcta64 --pheno ozht.phen --qcovar ozht.prs2.qcovar --covar ozht.covar --grm ozGRM --out ozhtPRS2 --reml --reml-est-fix ./gcta64 --pheno ozht.phen --qcovar ozht.prs3.qcovar --covar ozht.covar --grm ozGRM --out ozhtPRS3 --reml --reml-est-fix ./gcta64 --pheno ozht.phen --qcovar ozht.prs4.qcovar --covar ozht.covar --grm ozGRM --out ozhtPRS4 --reml --reml-est-fix ./gcta64 --pheno ozht.phen --qcovar ozht.prs5.qcovar --covar ozht.covar --grm ozGRM --out ozhtPRS5 --reml --reml-est-fix #GCTA will make a .log file and an .hsq file for each model. #The beta and SE of the PRS effect can be found in the line labeled X_7 in the log files. #We can pull these out of the log files and save them to a file with the following code grep X_7 *.log > gcta_results.txt ``` # Format GCTA outputs predictivness ```{R, message=FALSE, warning=FALSE, eval=T} # We can read in the gcta results res<-as.data.frame(read.table("gcta_results.txt")) names(res)<-c("Model","beta","se") # To calculate R2 we scale beta into a marginal correlation and then square it # We are calculating this and adding it to the res object res$r2<-rbind((res[1,2]/sd(all$ht, na.rm = T)*sd(all$PRS1 , na.rm = T))**2, (res[2,2]/sd(all$ht, na.rm = T)*sd(all$PRS2 , na.rm = T))**2, (res[3,2]/sd(all$ht, na.rm = T)*sd(all$PRS3 , na.rm = T))**2, (res[4,2]/sd(all$ht, na.rm = T)*sd(all$PRS4 , na.rm = T))**2, (res[5,2]/sd(all$ht, na.rm = T)*sd(all$PRS5 , na.rm = T))**2) # To test the significance of the betas we can use t-tests # We are calculating this and adding it to the res object res$tStat<-res$beta/res$se # Follows a student distribution res$pval<-(1-pt(q=res$tStat, df = 1887, lower.tail = T))*2 # Now we can look at the results res ``` # Make barplots to summarise associations ```{R, message=FALSE, warning=FALSE, eval=T} r2 <- data.frame( name = c("5e-08", "1e-06", "0.0001", "0.01", "1"), R2 = c(1.236,1.913,2.218,2.340,2.561 ), P=c(0.000195,5.04e-06,6.91e-07,2.67e-07, 6.12e-08) ) # Increase bottom margin par(mar=c(4,4,4,4)) # Barplot #jpeg("height.R2.jpg", width = 800, height = 800, quality=75, bg = "white", type = "cairo") my_bar <- barplot(r2$R2 , border=F , names.arg=r2$name , las=2 , col=c('skyblue1','skyblue2','skyblue3','skyblue4') , ylim=c(0,3) , main="Predicting height from height PRS", ylab="% variance explained", xlab="PRS Thresholds") # Add P values to the bars text(my_bar, r2$R2+0.04 , paste("P=", r2$P, sep="") ,cex=1) #dev.off() # Next we'll plot height by quintile of PRS (often these plots are made with deciles but the sample is small so we're going to use quintiles here) library(Hmisc) library(gplots) # the following command divides the data into quintiles. longData$percentiles <- factor(cut2(longData$PRS5, g=5, levels.mean=TRUE)) # Plot the mean of height by precentile with 95% CIs #jpeg("Quintile.jpg", width = 800, height = 800, quality=75, bg = "white", type = "cairo") plotmeans(ht ~ percentiles, data = longData, xlab="Quintile of PRS", ylab="Height in Cm",legends = c("1","2","3","4","5")) #dev.off() ``` # Optional analysis - adapt code to fit ACE model in GCTA ## Make matrix of variance covariance for the shared environment ```{r, message=FALSE, warning=FALSE, eval=T} # We can make the CRM by modyfing the GRM hist(GRM$off[which(GRM$off>0.20)], breaks=100) # Remember that the sample contains MZ and DZ twins (or siblings) # Format GRM into a NxN matrix (simpler to write later) asNxNGRM<-function(BRMbin){ mat<-matrix(0, nrow = length(BRMbin$diag), ncol = length(BRMbin$diag)) mat[upper.tri(mat)]<-BRMbin$off mat<-mat+t(mat) diag(mat)<-BRMbin$diag colnames(mat)<-BRMbin$id[,2] rownames(mat)<-BRMbin$id[,2] return(mat) } # convert GRM into NxN matrix GRMmat=asNxNGRM(GRM) # We can build the CRM as a modified GRM CRM=GRMmat CRM[which(CRM> 0.20, arr.ind = T)]=1 # we assume that individuals with GRM>0.2 have grown up together -> C=1 CRM[which(CRM< 0.20, arr.ind = T)]=0 # other less related individuals have no shared environment -> C=0 # Check how many 1 and 0 we have table(CRM[upper.tri(CRM)]) # 978 families table(diag(CRM)) # check diagonal # We get the info about family ID and IID - needed to create the .grm.id file fam=GRM$id colnames(fam)=c("fam", "id") # We get this package which allows writing a GRM # install.packages("genio") library(genio) # needed to write_grm genio::write_grm(kinship = CRM, name = "ozCRM", fam = fam ) ``` ## Fit several random effects in GCTA ```{bash, message=FALSE, warning=FALSE, eval=T} # We create this text file, which stores the name of the different GRM/CRM matrices so GCTA knows which to open echo "ozGRM" > GRM-CRM.txt echo "ozCRM" >> GRM-CRM.txt head GRM-CRM.txt ./gcta64 --pheno ozht.phen --qcovar ozht.prs1.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozhtPRS1.ACE --reml --reml-est-fix ./gcta64 --pheno ozht.phen --qcovar ozht.prs2.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozhtPRS2.ACE --reml --reml-est-fix ./gcta64 --pheno ozht.phen --qcovar ozht.prs3.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozhtPRS3.ACE --reml --reml-est-fix ./gcta64 --pheno ozht.phen --qcovar ozht.prs4.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozhtPRS4.ACE --reml --reml-est-fix ./gcta64 --pheno ozht.phen --qcovar ozht.prs5.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozhtPRS5.ACE --reml --reml-est-fix grep X_7 *ACE.log > gcta_results_ACE.txt ``` ## Format results ```{R, message=FALSE, warning=FALSE, eval=T} # We can read in the gcta results res<-as.data.frame(read.table("gcta_results_ACE.txt")) names(res)<-c("Model","beta","se") # To calculate R2 we scale beta into a marginal correlation and then square it # We are calculating this and adding it to the res object res$r2<-rbind((res[1,2]/sd(all$ht, na.rm = T)*sd(all$PRS1 , na.rm = T))**2, (res[2,2]/sd(all$ht, na.rm = T)*sd(all$PRS2 , na.rm = T))**2, (res[3,2]/sd(all$ht, na.rm = T)*sd(all$PRS3 , na.rm = T))**2, (res[4,2]/sd(all$ht, na.rm = T)*sd(all$PRS4 , na.rm = T))**2, (res[5,2]/sd(all$ht, na.rm = T)*sd(all$PRS5 , na.rm = T))**2) # To test the significance of the betas we can use t-tests # We are calculating this and adding it to the res object res$tStat<-res$beta/res$se # Follows a student distribution res$pval<-(1-pt(q=res$tStat, df = 1887, lower.tail = T))*2 # Now we can look at the results res ```