---
title: "Day 7 ISGW workshop tutorial"
author: "[Sarah Medland & Baptiste Couvy-Duchesne](https://github.com/baptisteCD)"
date: "`r format(Sys.time(), '%d %B %Y')`"
mail: "baptiste.couvyduchesne@gmail.com"
github: "baptisteCD"
output:
epuRate::ISGW:
toc: TRUE
number_sections: FALSE
code_folding: "show"
---
```{r, echo=FALSE, message=FALSE, warning=FALSE}
# You need these libraries to run this template:
library(rmarkdown) # install.packages("rmarkdown")
library(epuRate) # devtools::install_github("baptisteCD/epuRate", force=TRUE)
# Soft-wrap code in knitting
knitr::opts_chunk$set(tidy.opts = list(width.cutoff = 60), tidy = TRUE)
```
> Answer sheet for the tutorial
# Setting up environment
```{bash, message=FALSE, warning=FALSE, eval=F}
mkdir -p day7
cd day7
cp -r /faculty/sarah/2022/Day7/* .
```
# PRS calculation using PRSice
```{bash, message=FALSE, warning=FALSE, eval=F}
# PRSice code - run this in the ssh terminal window
Rscript /usr/local/bin/PRSice.R --dir . --prsice /usr/local/bin/PRSice --base weights.prs --target ozbmi --thread 1 --snp SNP --chr CHR --bp BP --A1 A1 --A2 A2 --stat BETA --pvalue P --beta --pheno ozht.pheno --score sum --all-score --bar-levels 0.00000005,0.000001,0.0001,0.01,1 --fastscore --quantile 5 --quant-ref 1 --binary-target F --no-regress
```
# Open tables and merge
```{R, message=FALSE, warning=FALSE, eval=T}
pheno<-read.table("ozht.pheno", header=T)
covar<-read.table("ozht.cov", header=T)
PRS<-read.table("PRSice.all_score", header=T)
head(PRS)
names(PRS)<-c("FID","IID","PRS1","PRS2","PRS3","PRS4","PRS5")
head(PRS)
pheno$ht<-pheno$ht*100
# Best to use names instead of column number
temp=merge(pheno, covar[,2:8], by="IID")
longData=merge(temp, PRS[,2:7], by="IID")
# Run models
nullModel<-lm(ht~age+sex+PC1+PC2+PC3+PC4, data=longData)
summary(nullModel)
#Then we are going to run a model that includes the covariates and the first PRS
thresh1<-lm(ht~age+sex+PC1+PC2+PC3+PC4+PRS1, data=longData)
summary(thresh1)
#We can compute the difference in R2 between the 2 models using the following code
summary(thresh1)$r.squared-summary(nullModel)$r.squared
```
# Format data - to one row per family (long -> wide)
```{R, message=FALSE, warning=FALSE, eval=T}
zyg<-read.table("ozht.zyg", header=T)
#We will then merge on the zyg data using the individual ID "IID" as the key.
longData1=merge(longData, zyg[,2:5], by="IID")
#Take a look at the result to check that it worked
head(longData1)
#For openMx we need our data to be in a wide format with one line per family so we will need to reshape the data.
#In this code the variables listed in v.names are present for both twins
#The variable Twin specifies if the person is twin 1 or 2
wideData=reshape(data = longData1, direction = "wide", v.names = c("IID", "ht", "age", "sex", "PC1", "PC2", "PC3", "PC4", "PRS1", "PRS2", "PRS3", "PRS4", "PRS5"), idvar = "FID", timevar = "Twin", sep="_" )
# We will replace all missingness in the definition variables with 0
# the people with missing definition variables don't have phenotypic data so this will not bais estimates
wideData[,c(6:12,19:25)][is.na(wideData[,c(6:12,19:25)])] <- 0
```
# OpenMx model
```{R, message=FALSE, warning=FALSE, eval=T}
library(OpenMx)
source("miFunctions5272022.R")
# Select Variables for Analysis
vars <- 'ht' # list of variables names
nv <- 1 # number of variables
ntv <- nv*2 # number of total variables
depVar <- c("ht_1","ht_2") # dependent variables
indVar <- c("age_1", "sex_1", "PC1_1", "PC2_1", "PC3_1", "PC4_1", "PRS1_1",
"age_2", "sex_2", "PC1_2", "PC2_2", "PC3_2", "PC4_2", "PRS1_2") # independent variables or predictors
selVars <- c(depVar,indVar)
# Select Data for Analysis
mzData <- subset(wideData, MZDZ==1, selVars)
dzData <- subset(wideData, MZDZ==2, selVars)
# Set Starting Values
svMe <- 170 # start value for means
svVa <- 100 # start value for variance
lbVa <- 0.00001 # lower bound for variance
svBe <- 0.0001 # start value for means
# Create Data Objects for Multiple Groups
dataMZ <- mxData( observed=mzData, type="raw" )
dataDZ <- mxData( observed=dzData, type="raw" )
# Objects to hold definition variables for the regression
dPC1 <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.PC1_1","data.PC1_2"), name="dPC1" )
dPC2 <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.PC2_1","data.PC2_2"), name="dPC2" )
dPC3 <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.PC3_1","data.PC3_2"), name="dPC3" )
dPC4 <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.PC4_1","data.PC4_2"), name="dPC4" )
dsex <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.sex_1","data.sex_2"), name="dsex" )
dage <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.age_1","data.age_2"), name="dage" )
dPRS <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.PRS1_1","data.PRS1_2"), name="dPRS" )
# Objects to hold betas for the covariates
bPC1 <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b1", name="bPC1" )
bPC2 <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b2", name="bPC2" )
bPC3 <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b3", name="bPC3" )
bPC4 <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b4", name="bPC4" )
bsex <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b5", name="bsex" )
bage <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b6", name="bage" )
bPRS <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b7", name="bPRS" )
#Then we will build the model for the means that includes the regression tems
#(we are using Kronecker products here to mutliply the values of the independent variables by the betas)
meanG <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svMe, labels=labVars("mean",vars), name="meanG" )
ExpMean <- mxAlgebra( expression= (meanG + bPC1%x%dPC1 + bPC2%x%dPC2 + bPC3%x%dPC3 + bPC4%x%dPC4 + bsex%x%dsex + bage%x%dage + bPRS%x%dPRS) , name="expMean" )
#and do some house keeping (making a list of the objects so we can pull these into the model later)
pars<- c(bPC1, bPC2, bPC3, bPC4, bsex, bage, bPRS, meanG)
defs<- c(dPC1, dPC2, dPC3, dPC4, dsex, dage, dPRS)
# Create Algebra for expected Variance/Covariance Matrices
covMZ <- mxMatrix( type="Symm", nrow=ntv, ncol=ntv, free=TRUE, values=c(.1,.05,.1), lbound=valDiag(lbVa,ntv),labels=c("var","cMZ21","var"), name="covMZ" )
covDZ <- mxMatrix( type="Symm", nrow=ntv, ncol=ntv, free=TRUE, values=c(.1,.05,.1), lbound=valDiag(lbVa,ntv), labels=c("var","cDZ21","var"), name="covDZ" )
#Next we build the expection and model objects
# Create Expectation Objects for Multiple Groups
expMZ <- mxExpectationNormal( covariance="covMZ", means="expMean", dimnames=depVar )
expDZ <- mxExpectationNormal( covariance="covDZ", means="expMean", dimnames=depVar )
funML <- mxFitFunctionML()
# Create Model Objects for Multiple Groups
modelMZ <- mxModel( pars, defs, ExpMean, covMZ, dataMZ, expMZ, funML, name="MZ" )
modelDZ <- mxModel( pars, defs, ExpMean, covDZ, dataDZ, expDZ, funML, name="DZ" )
multi <- mxFitFunctionMultigroup( c("MZ","DZ") )
# Build Saturated Model
model_wPRS <- mxModel( "wPRS", pars, modelMZ, modelDZ, multi )
# Run model
fit_wPRS <- mxRun( model_wPRS )
(sum_wPRS <- summary( fit_wPRS ))
#And then run a submodel whic drops the PRS effect from the model
# Drop the PRS from the model (running the null model)
model_Null <- mxModel( fit_wPRS, name="noPRS" )
model_Null <- omxSetParameters( model_Null, labels="b7", free=FALSE, values=0 )
fit_Null <- mxRun( model_Null)
(sum_Null <- summary( fit_Null ))
#We can work out the significance of the PRS effect by comparing the fit of the models
mxCompare( fit_wPRS, fit_Null )
#and calculate r2 as the difference in variance between the models gives variance explained by PRS, divided by total variance
(fit_Null$MZ$covMZ$values[1,2]-fit_wPRS$MZ$covMZ$values[1,2])/fit_Null$MZ$covMZ$values[1,2]
```
# Calculate GRM using GCTA
```{bash, message=FALSE, warning=FALSE, eval=T}
./gcta64 --bfile ozbmi --make-grm --out ozGRM
```
# Open GRM and draw some plots
```{R, message=FALSE, warning=FALSE, eval=T}
# Open GRM and visualise off-diagonal distribution
ReadGRMBin=function(prefix, AllN=F, size=4){
sum_i=function(i){
return(sum(1:i))
}
BinFileName=paste(prefix,".grm.bin",sep="")
NFileName=paste(prefix,".grm.N.bin",sep="")
IDFileName=paste(prefix,".grm.id",sep="")
id = read.table(IDFileName)
n=dim(id)[1]
BinFile=file(BinFileName, "rb");
grm=readBin(BinFile, n=n*(n+1)/2, what=numeric(0), size=size)
NFile=file(NFileName, "rb");
if(AllN==T){
N=readBin(NFile, n=n*(n+1)/2, what=numeric(0), size=size)
}
else N=readBin(NFile, n=1, what=numeric(0), size=size)
i=sapply(1:n, sum_i)
return(list(diag=grm[i], off=grm[-i], id=id, N=N))
}
GRM=ReadGRMBin(prefix = "ozGRM")
hist(GRM$diag, breaks=100) # Large values can indicate data problems or outstanding homozygosity rate
#
hist(GRM$off, breaks = 100)
# This plot isn't very informative as most relationship coefficents are close to 0. We can update our plot to only show relationship coefficents greater than 0.05 using the following code
hist(GRM$off[which(GRM$off>0.05)], breaks=100)
#This plot clearly shows we have a lot of siblings and MZ twin pairs. If we zoom in even further we might spot those cousins
hist(GRM$off[which(GRM$off>0.05)], breaks=100, xlim=c(0.05,0.3))
```
# Write files for GCTA analysis
```{R, message=FALSE, warning=FALSE, eval=T}
prs=read.table("PRSice.all_score", header=T)
covar<-read.table("ozht.cov", header=T)
pheno<-read.table("ozht.pheno", header=T)
#Then rescaling and renaming
pheno$ht<-pheno$ht*100
names(prs)<-c("FID","IID","PRS1","PRS2","PRS3","PRS4","PRS5")
#Then merging the data into 1 dataframe
all=merge(covar, prs, by=c("IID", "FID"))
all=merge(all, pheno, by=c("IID", "FID"))
# Then we'll write out the covariate files for the continuous covariates that we want to include in the GCTA model
# It's important that the PRS is in the last column of this file
write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4", "PRS1")], "ozht.prs1.qcovar", quote = F, row.names = F)
write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4", "PRS2")], "ozht.prs2.qcovar", quote = F, row.names = F)
write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4", "PRS3")], "ozht.prs3.qcovar", quote = F, row.names = F)
write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4", "PRS4")], "ozht.prs4.qcovar", quote = F, row.names = F)
write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4", "PRS5")], "ozht.prs5.qcovar", quote = F, row.names = F)
# Then we'll write out the covariate files for the binary covariates that we want to include in the GCTA model
write.table(all[,c("FID", "IID", "sex")], "ozht.covar", quote = F, row.names = F)
# Then we'll write out the phenotype files for the GCTA model
write.table(all[,c("FID", "IID", "ht")], "ozht.phen", quote = F, row.names = F)
```
# Run GCTA with fixed and random effect (AE model)
```{bash, message=FALSE, warning=FALSE, eval=T}
./gcta64 --pheno ozht.phen --qcovar ozht.prs1.qcovar --covar ozht.covar --grm ozGRM --out ozhtPRS1 --reml --reml-est-fix
./gcta64 --pheno ozht.phen --qcovar ozht.prs2.qcovar --covar ozht.covar --grm ozGRM --out ozhtPRS2 --reml --reml-est-fix
./gcta64 --pheno ozht.phen --qcovar ozht.prs3.qcovar --covar ozht.covar --grm ozGRM --out ozhtPRS3 --reml --reml-est-fix
./gcta64 --pheno ozht.phen --qcovar ozht.prs4.qcovar --covar ozht.covar --grm ozGRM --out ozhtPRS4 --reml --reml-est-fix
./gcta64 --pheno ozht.phen --qcovar ozht.prs5.qcovar --covar ozht.covar --grm ozGRM --out ozhtPRS5 --reml --reml-est-fix
#GCTA will make a .log file and an .hsq file for each model.
#The beta and SE of the PRS effect can be found in the line labeled X_7 in the log files.
#We can pull these out of the log files and save them to a file with the following code
grep X_7 *.log > gcta_results.txt
```
# Format GCTA outputs predictivness
```{R, message=FALSE, warning=FALSE, eval=T}
# We can read in the gcta results
res<-as.data.frame(read.table("gcta_results.txt"))
names(res)<-c("Model","beta","se")
# To calculate R2 we scale beta into a marginal correlation and then square it
# We are calculating this and adding it to the res object
res$r2<-rbind((res[1,2]/sd(all$ht, na.rm = T)*sd(all$PRS1 , na.rm = T))**2,
(res[2,2]/sd(all$ht, na.rm = T)*sd(all$PRS2 , na.rm = T))**2,
(res[3,2]/sd(all$ht, na.rm = T)*sd(all$PRS3 , na.rm = T))**2,
(res[4,2]/sd(all$ht, na.rm = T)*sd(all$PRS4 , na.rm = T))**2,
(res[5,2]/sd(all$ht, na.rm = T)*sd(all$PRS5 , na.rm = T))**2)
# To test the significance of the betas we can use t-tests
# We are calculating this and adding it to the res object
res$tStat<-res$beta/res$se # Follows a student distribution
res$pval<-(1-pt(q=res$tStat, df = 1887, lower.tail = T))*2
# Now we can look at the results
res
```
# Make barplots to summarise associations
```{R, message=FALSE, warning=FALSE, eval=T}
r2 <- data.frame(
name = c("5e-08", "1e-06", "0.0001", "0.01", "1"),
R2 = c(1.236,1.913,2.218,2.340,2.561 ),
P=c(0.000195,5.04e-06,6.91e-07,2.67e-07, 6.12e-08)
)
# Increase bottom margin
par(mar=c(4,4,4,4))
# Barplot
#jpeg("height.R2.jpg", width = 800, height = 800, quality=75, bg = "white", type = "cairo")
my_bar <- barplot(r2$R2 , border=F , names.arg=r2$name ,
las=2 ,
col=c('skyblue1','skyblue2','skyblue3','skyblue4') ,
ylim=c(0,3) ,
main="Predicting height from height PRS",
ylab="% variance explained", xlab="PRS Thresholds")
# Add P values to the bars
text(my_bar, r2$R2+0.04 , paste("P=", r2$P, sep="") ,cex=1)
#dev.off()
# Next we'll plot height by quintile of PRS (often these plots are made with deciles but the sample is small so we're going to use quintiles here)
library(Hmisc)
library(gplots)
# the following command divides the data into quintiles.
longData$percentiles <- factor(cut2(longData$PRS5, g=5, levels.mean=TRUE))
# Plot the mean of height by precentile with 95% CIs
#jpeg("Quintile.jpg", width = 800, height = 800, quality=75, bg = "white", type = "cairo")
plotmeans(ht ~ percentiles, data = longData, xlab="Quintile of PRS", ylab="Height in Cm",legends = c("1","2","3","4","5"))
#dev.off()
```
# Optional analysis - adapt code to fit ACE model in GCTA
## Make matrix of variance covariance for the shared environment
```{r, message=FALSE, warning=FALSE, eval=T}
# We can make the CRM by modyfing the GRM
hist(GRM$off[which(GRM$off>0.20)], breaks=100) # Remember that the sample contains MZ and DZ twins (or siblings)
# Format GRM into a NxN matrix (simpler to write later)
asNxNGRM<-function(BRMbin){
mat<-matrix(0, nrow = length(BRMbin$diag), ncol = length(BRMbin$diag))
mat[upper.tri(mat)]<-BRMbin$off
mat<-mat+t(mat)
diag(mat)<-BRMbin$diag
colnames(mat)<-BRMbin$id[,2]
rownames(mat)<-BRMbin$id[,2]
return(mat)
}
# convert GRM into NxN matrix
GRMmat=asNxNGRM(GRM)
# We can build the CRM as a modified GRM
CRM=GRMmat
CRM[which(CRM> 0.20, arr.ind = T)]=1 # we assume that individuals with GRM>0.2 have grown up together -> C=1
CRM[which(CRM< 0.20, arr.ind = T)]=0 # other less related individuals have no shared environment -> C=0
# Check how many 1 and 0 we have
table(CRM[upper.tri(CRM)]) # 978 families
table(diag(CRM)) # check diagonal
# We get the info about family ID and IID - needed to create the .grm.id file
fam=GRM$id
colnames(fam)=c("fam", "id")
# We get this package which allows writing a GRM
# install.packages("genio")
library(genio) # needed to write_grm
genio::write_grm(kinship = CRM, name = "ozCRM", fam = fam )
```
## Fit several random effects in GCTA
```{bash, message=FALSE, warning=FALSE, eval=T}
# We create this text file, which stores the name of the different GRM/CRM matrices so GCTA knows which to open
echo "ozGRM" > GRM-CRM.txt
echo "ozCRM" >> GRM-CRM.txt
head GRM-CRM.txt
./gcta64 --pheno ozht.phen --qcovar ozht.prs1.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozhtPRS1.ACE --reml --reml-est-fix
./gcta64 --pheno ozht.phen --qcovar ozht.prs2.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozhtPRS2.ACE --reml --reml-est-fix
./gcta64 --pheno ozht.phen --qcovar ozht.prs3.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozhtPRS3.ACE --reml --reml-est-fix
./gcta64 --pheno ozht.phen --qcovar ozht.prs4.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozhtPRS4.ACE --reml --reml-est-fix
./gcta64 --pheno ozht.phen --qcovar ozht.prs5.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozhtPRS5.ACE --reml --reml-est-fix
grep X_7 *ACE.log > gcta_results_ACE.txt
```
## Format results
```{R, message=FALSE, warning=FALSE, eval=T}
# We can read in the gcta results
res<-as.data.frame(read.table("gcta_results_ACE.txt"))
names(res)<-c("Model","beta","se")
# To calculate R2 we scale beta into a marginal correlation and then square it
# We are calculating this and adding it to the res object
res$r2<-rbind((res[1,2]/sd(all$ht, na.rm = T)*sd(all$PRS1 , na.rm = T))**2,
(res[2,2]/sd(all$ht, na.rm = T)*sd(all$PRS2 , na.rm = T))**2,
(res[3,2]/sd(all$ht, na.rm = T)*sd(all$PRS3 , na.rm = T))**2,
(res[4,2]/sd(all$ht, na.rm = T)*sd(all$PRS4 , na.rm = T))**2,
(res[5,2]/sd(all$ht, na.rm = T)*sd(all$PRS5 , na.rm = T))**2)
# To test the significance of the betas we can use t-tests
# We are calculating this and adding it to the res object
res$tStat<-res$beta/res$se # Follows a student distribution
res$pval<-(1-pt(q=res$tStat, df = 1887, lower.tail = T))*2
# Now we can look at the results
res
```