# PRSice code - run this in the ssh terminal window

Rscript /usr/local/bin/PRSice.R --dir . --prsice /usr/local/bin/PRSice  --base weights.prs --target ozbmi  --thread 1  --snp SNP --chr CHR --bp BP --A1 A1 --A2 A2 --stat BETA --pvalue P  --beta --pheno ozht.pheno --score sum --all-score --bar-levels 0.00000005,0.000001,0.0001,0.01,1  --fastscore --quantile 5 --quant-ref 1 --binary-target F --no-regress


#***************************************************************************************************************************
# R code run in R Studio

rm(list=ls())
setwd("~/2022/working")

# Read in the phenotype covariance and prs files
pheno<-read.table("ozht.pheno", header=T)
covar<-read.table("ozht.cov", header=T)
PRS<-read.table("PRSice.all_score", header=T)

#lets give the PRS variables some more user friendly names
head(PRS)
names(PRS)<-c("FID","IID","PRS1","PRS2","PRS3","PRS4","PRS5")
head(PRS)

#and convert the height to cm
pheno$ht<-pheno$ht*100

# Merge the pheno covar PRS and zyg files
# we are using [ ] to subset the columns so that the FID is not duplicated in the merged files
temp=merge(pheno, covar[,2:8], by="IID")
longData=merge(temp, PRS[,2:7], by="IID")

#Next we are going to run a Null model that includes the covariates but it doesn't include any PRS
nullModel<-lm(ht~age+sex+PC1+PC2+PC3+PC4, data=longData)
summary(nullModel)

#Then we are going to run a model that includes the covariates and the first PRS
thresh1<-lm(ht~age+sex+PC1+PC2+PC3+PC4+PRS1, data=longData)
summary(thresh1)

#We can compute the difference in R2 between the 2 models using the following code
summary(thresh1)$r.squared-summary(nullModel)$r.squared

#***********************************************************************************************************************
# Your task is to run a series of lm models to evaluate the predictive abilites of each of the 5 PRS.
#***********************************************************************************************************************


# Read in the zygosity information
zyg<-read.table("ozht.zyg", header=T)

#We will then merge on the zyg data using the individual ID "IID" as the key.
longData1=merge(longData, zyg[,2:5], by="IID")

#Take a look at the result to check that it worked
head(longData1)

#For openMx we need our data to be in a wide format with one line per family so we will need to reshape the data.
#In this code the variables listed in v.names are present for both twins
#The variable Twin specifies if the person is twin 1 or 2
wideData=reshape(data = longData1, direction = "wide", v.names = c("IID", "ht", "age", "sex", "PC1", "PC2", "PC3", "PC4", "PRS1", "PRS2", "PRS3", "PRS4", "PRS5"), idvar = "FID", timevar = "Twin", sep="_")
head(wideData)

# We will replace all missingness in the definition variables with 0
# the people with missing definition variables don't have phenotypic data so this will not bais estimates
wideData[,c(6:16,19:29)][is.na(wideData[,c(6:16,19:29)])] <- 0


#***********************************************************************************************************************
# Next we are going to run the model in openMx to correct for relatedness


# ----------------------------------------------------------------------------------------------------------------------
# Author:  Sarah Medland (adpted from oneSATca.R by Hermine Maes)
# Date: 01 06 2022
#
# Twin Univariate Saturated model with one means and one variance across multiple groups - with PRS as a covariate
# Matrix style model - Raw data - Continuous data
# -------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|


# Load Libraries & Options
library(OpenMx)
mxOption( NULL, "Default optimizer", "NPSOL" )
source("miFunctions5272022.R")


# Create Output
#filename <- "SaturatedModel_withPRS"
#sink(paste(filename,".Ro",sep=""), append=FALSE, split=TRUE)
# ----------------------------------------------------------------------------------------------------------------------

# Data used in this script will be the wideData object
# ----------------------------------------------------------------------------------------------------------------------

# Select Variables for Analysis
vars <- 'ht' # list of variables names
nv <- 1 # number of variables
ntv <- nv*2 # number of total variables
depVar <- c("ht_1","ht_2") # dependent variables
indVar <- c("age_1", "sex_1", "PC1_1", "PC2_1", "PC3_1", "PC4_1", "PRS1_1",
"age_2", "sex_2", "PC1_2", "PC2_2", "PC3_2", "PC4_2", "PRS1_2") # independent variables or predictors
selVars <- c(depVar,indVar)


# Select Data for Analysis
mzData <- subset(wideData, MZDZ==1, selVars)
dzData <- subset(wideData, MZDZ==2, selVars)

# Set Starting Values
svMe <- 170 # start value for means
svVa <- 100 # start value for variance
lbVa <- 0.00001 # lower bound for variance
svBe <- 0.0001 # start value for means

# Create Data Objects for Multiple Groups
dataMZ <- mxData( observed=mzData, type="raw" )
dataDZ <- mxData( observed=dzData, type="raw" )


# ----------------------------------------------------------------------------------------------------------------------
# PREPARE MODEL
# Create Algebra for expected Mean Matrices

# Objects to hold definition variables for the regression
dPC1 <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.PC1_1","data.PC1_2"), name="dPC1" )
dPC2 <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.PC2_1","data.PC2_2"), name="dPC2" )
dPC3 <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.PC3_1","data.PC3_2"), name="dPC3" )
dPC4 <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.PC4_1","data.PC4_2"), name="dPC4" )
dsex <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.sex_1","data.sex_2"), name="dsex" )
dage <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.age_1","data.age_2"), name="dage" )
dPRS <- mxMatrix( type="Full", nrow=1, ncol=2, free=FALSE, labels=c("data.PRS1_1","data.PRS1_2"), name="dPRS" )
# Objects to hold betas for the covariates
bPC1 <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b1", name="bPC1" )
bPC2 <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b2", name="bPC2" )
bPC3 <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b3", name="bPC3" )
bPC4 <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b4", name="bPC4" )
bsex <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b5", name="bsex" )
bage <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b6", name="bage" )
bPRS <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svBe, label="b7", name="bPRS" )
# Build the means model
meanG <- mxMatrix( type="Full", nrow=1, ncol=1, free=TRUE, values=svMe, labels=labVars("mean",vars), name="meanG" )
ExpMean <- mxAlgebra( expression= (meanG + bPC1%x%dPC1 + bPC2%x%dPC2 + bPC3%x%dPC3 + bPC4%x%dPC4 + bsex%x%dsex + bage%x%dage + bPRS%x%dPRS) , name="expMean" )
pars<- c(bPC1, bPC2, bPC3, bPC4, bsex, bage, bPRS, meanG)
defs<- c(dPC1, dPC2, dPC3, dPC4, dsex, dage, dPRS)

# Create Algebra for expected Variance/Covariance Matrices
covMZ <- mxMatrix( type="Symm", nrow=ntv, ncol=ntv, free=TRUE, values=c(.1,.05,.1), lbound=valDiag(lbVa,ntv),
                   labels=c("var","cMZ21","var"), name="covMZ" )
covDZ <- mxMatrix( type="Symm", nrow=ntv, ncol=ntv, free=TRUE, values=c(.1,.05,.1), lbound=valDiag(lbVa,ntv),
                   labels=c("var","cDZ21","var"), name="covDZ" )

# Create Expectation Objects for Multiple Groups
expMZ <- mxExpectationNormal( covariance="covMZ", means="expMean", dimnames=depVar )
expDZ <- mxExpectationNormal( covariance="covDZ", means="expMean", dimnames=depVar )
funML <- mxFitFunctionML()

# Create Model Objects for Multiple Groups
modelMZ <- mxModel( pars, defs, ExpMean, covMZ, dataMZ, expMZ, funML, name="MZ" )
modelDZ <- mxModel( pars, defs, ExpMean, covDZ, dataDZ, expDZ, funML, name="DZ" )
multi <- mxFitFunctionMultigroup( c("MZ","DZ") )

# Build Saturated Model 
model_wPRS <- mxModel( "wPRS", pars, modelMZ, modelDZ, multi )

# ----------------------------------------------------------------------------------------------------------------------
# RUN MODEL
# Run Saturated Model
fit_wPRS <- mxRun( model_wPRS )
(sum_wPRS <- summary( fit_wPRS ))

# ----------------------------------------------------------------------------------------------------------------------
# RUN SUBMODELS
# Drop the PRS from the model (running the null model)
model_Null <- mxModel( fit_wPRS, name="noPRS" )
model_Null <- omxSetParameters( model_Null, labels="b7", free=FALSE, values=0 )
fit_Null <- mxRun( model_Null)
(sum_Null <- summary( fit_Null ))

# ----------------------------------------------------------------------------------------------------------------------
# Print Comparative Fit Statistics
mxCompare( fit_wPRS, fit_Null )

# calculate r2
# difference in variance between the models gives variance explained by PRS
# divided by total variance to express as a %
(fit_Null$MZ$covMZ$values[1,2]-fit_wPRS$MZ$covMZ$values[1,2])/fit_Null$MZ$covMZ$values[1,2]

# ----------------------------------------------------------------------------------------------------------------------
#sink()

#***********************************************************************************************************************
# Your task is to run a series of openMx models to evaluate the predictive abilites of each of the 5 PRS.
#***********************************************************************************************************************


- remember to find and replace all mentions of the PRS variable in the openMx code

#***********************************************************************************************************************
# Next we are going to run the analyses in GCTA
#***********************************************************************************************************************


!!! Run this in the ssh terminal
# Make GRM using GCTA  
gcta64 --bfile ozbmi --make-grm --out ozGRM


!!!!Run the code below in R

#We can take a look at the contents of the GRM using the following code
#  Open GRM and visualise off-diagonal distribution
ReadGRMBin=function(prefix, AllN=F, size=4){
  sum_i=function(i){
    return(sum(1:i))
  }
  BinFileName=paste(prefix,".grm.bin",sep="")
  NFileName=paste(prefix,".grm.N.bin",sep="")
  IDFileName=paste(prefix,".grm.id",sep="")
  id = read.table(IDFileName)
  n=dim(id)[1]
  BinFile=file(BinFileName, "rb");
  grm=readBin(BinFile, n=n*(n+1)/2, what=numeric(0), size=size)
  NFile=file(NFileName, "rb");
  if(AllN==T){
    N=readBin(NFile, n=n*(n+1)/2, what=numeric(0), size=size)
  }
  else N=readBin(NFile, n=1, what=numeric(0), size=size)
  i=sapply(1:n, sum_i)
  return(list(diag=grm[i], off=grm[-i], id=id, N=N))
}

GRM=ReadGRMBin(prefix = "ozGRM")

#We can look at the distribution of diagonal values (ie the covariation of a person with themselves) using the following code.

hist(GRM$diag, breaks=100) # Large values can indicate data problems or outstanding homozygosity rate

#Usually we would expect the values to be close to 1. This teaching example only includes ~10,000 snps so there is more variation here than we would usually expect.

#How many off diagonal elements are in this matrix (ie covarations between people)?
length(GRM$off) 

#We can look at the off diagnoal elements using the following code
hist(GRM$off, breaks = 100)

#This plot isn't very informative as most relationship coefficents are close to 0. We can update our plot to only show relationship coefficents greater than 0.05 using the following code
hist(GRM$off[which(GRM$off>0.05)], breaks=100)

#This plot clearly shows we have a lot of siblings and MZ twin pairs. If we zoom in even further we might spot those cousins
hist(GRM$off[which(GRM$off>0.05)], breaks=100, xlim=c(0.05,0.3)

#We will use R to write out the files to run GCTA
#We'll start by reading in the data files again incase they have been lost
prs=read.table("PRSice.all_score", header=T)
covar<-read.table("ozht.cov", header=T)
pheno<-read.table("ozht.pheno", header=T)

#Then rescaling and renaming
pheno$ht<-pheno$ht*100
names(prs)<-c("FID","IID","PRS1","PRS2","PRS3","PRS4","PRS5")
 
#Then merging the data into 1 dataframe
all=merge(covar, prs, by=c("IID", "FID"))
all=merge(all, pheno, by=c("IID", "FID"))

# Then we'll write out the covariate files for the continuous covariates that we want to include in the GCTA model
# It's important that the PRS is in the last column of this file
write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4", "PRS1")], "ozht.prs1.qcovar", quote = F, row.names = F)
write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4", "PRS2")], "ozht.prs2.qcovar", quote = F, row.names = F)
write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4", "PRS3")], "ozht.prs3.qcovar", quote = F, row.names = F)
write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4", "PRS4")], "ozht.prs4.qcovar", quote = F, row.names = F)
write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4", "PRS5")], "ozht.prs5.qcovar", quote = F, row.names = F)

# Then we'll write out the covariate files for the binary covariates that we want to include in the GCTA model
write.table(all[,c("FID", "IID", "sex")], "ozht.covar", quote = F, row.names = F)

# Then we'll write out the phenotype files for the GCTA model
write.table(all[,c("FID", "IID", "ht")], "ozht.phen", quote = F, row.names = F)


 
#Now we are ready to run our gcta analyses.
!!!You will run these analyses by pasting this code into the ssh terminal

gcta64 --pheno ozht.phen --qcovar ozht.prs1.qcovar --covar ozht.covar --grm ozGRM --out ozhtPRS1 --reml --reml-est-fix
gcta64 --pheno ozht.phen --qcovar ozht.prs2.qcovar --covar ozht.covar --grm ozGRM --out ozhtPRS2 --reml --reml-est-fix
gcta64 --pheno ozht.phen --qcovar ozht.prs3.qcovar --covar ozht.covar --grm ozGRM --out ozhtPRS3 --reml --reml-est-fix
gcta64 --pheno ozht.phen --qcovar ozht.prs4.qcovar --covar ozht.covar --grm ozGRM --out ozhtPRS4 --reml --reml-est-fix
gcta64 --pheno ozht.phen --qcovar ozht.prs5.qcovar --covar ozht.covar --grm ozGRM --out ozhtPRS5 --reml --reml-est-fix

#GCTA will make a .log file and an .hsq file for each model. 
#The beta and SE of the PRS effect can be found in the line labeled X_7 in the log files.
#We can pull these out of the log files and save them to a file with the following code
 
grep X_7 *.log > gcta_results.txt
 
!!!! Back to R
# We can read in the gcta results
res<-as.data.frame(read.table("gcta_results.txt"))
names(res)<-c("Model","beta","se")

# To calculate R2 we scale beta into a marginal correlation and then square it
# We are calculating this and adding it to the res object
res$r2<-rbind((res[1,2]/sd(all$ht, na.rm = T)*sd(all$PRS1 , na.rm = T))**2,
	(res[2,2]/sd(all$ht, na.rm = T)*sd(all$PRS2 , na.rm = T))**2,
	(res[3,2]/sd(all$ht, na.rm = T)*sd(all$PRS3 , na.rm = T))**2,
	(res[4,2]/sd(all$ht, na.rm = T)*sd(all$PRS4 , na.rm = T))**2,
	(res[5,2]/sd(all$ht, na.rm = T)*sd(all$PRS5 , na.rm = T))**2)


# To test the significance of the betas we can use t-tests 
res$tStat<-res$beta/res$se # Follows a student distribution
res$pval<-(1-pt(q=tStat, df = 1887, lower.tail = T))*2 

# Now we can look at the results 
res



#***********************************************************************************************************************
# Now we are going to make some plots
#***********************************************************************************************************************


# Make barplots to summarise associations

r2 <- data.frame(
  name = c("5e-08", "1e-06", "0.0001", "0.01", "1"),
  R2 = c(1.236,1.913,2.218,2.340,2.561 ),
  P=c(0.000195,5.04e-06,6.91e-07,2.67e-07, 6.12e-08)
)
# Increase bottom margin
par(mar=c(4,4,4,4))
# Barplot
#jpeg("height.R2.jpg", width = 800, height = 800, quality=75, bg = "white", type = "cairo")
my_bar <- barplot(r2$R2 , border=F , names.arg=r2$name ,
                  las=2 ,
                  col=c('skyblue1','skyblue2','skyblue3','skyblue4') ,
                  ylim=c(0,3) ,
                  main="Predicting height from height PRS",
                  ylab="% variance explained", xlab="PRS Thresholds")
# Add P values to the bars
text(my_bar, r2$R2+0.04 , paste("P=", r2$P, sep="") ,cex=1)
#dev.off()

# Next we'll plot height by quintile of PRS (often these plots are made with deciles but the sample is small so we're going to use quintiles here)
library(Hmisc)
library(gplots)
# the following command divides the data into quintiles.
longData$percentiles <- factor(cut2(longData$PRS5, g=5, levels.mean=TRUE))

# Plot the mean of height by precentile with 95% CIs
#jpeg("Quintile.jpg", width = 800, height = 800, quality=75, bg = "white", type = "cairo")
plotmeans(ht ~ percentiles, data = longData, xlab="Quintile of PRS", ylab="Height in Cm",legends = c("1","2","3","4","5"))
#dev.off()
 


#****************************************************************************************************************

# STRETCH GOAL - Optional analysis - adapt code to fit ACE model in GCTA

# We can make the CRM by modyfing the GRM
hist(GRM$off[which(GRM$off>0.20)], breaks=100) # Remember that the sample contains MZ and DZ twins (or siblings)

# Format GRM into a NxN matrix (simpler to write later)
asNxNGRM<-function(BRMbin){
  mat<-matrix(0, nrow = length(BRMbin$diag), ncol = length(BRMbin$diag))
mat[upper.tri(mat)]<-BRMbin$off
mat<-mat+t(mat)
diag(mat)<-BRMbin$diag
colnames(mat)<-BRMbin$id[,2]
rownames(mat)<-BRMbin$id[,2]
  return(mat)
}

# convert GRM into NxN matrix
GRMmat=asNxNGRM(GRM)

# We can build the CRM as a modified GRM
CRM=GRMmat
CRM[which(CRM> 0.20, arr.ind = T)]=1 # we assume that individuals with GRM>0.2 have grown up together -> C=1
CRM[which(CRM< 0.20, arr.ind = T)]=0 # other less related individuals have no shared environment -> C=0

# Check how many 1 and 0 we have
table(CRM[upper.tri(CRM)]) # 978 families
table(diag(CRM)) # check diagonal

# We get the info about family ID and IID - needed to create the  .grm.id file
fam=GRM$id 
colnames(fam)=c("fam", "id")

# We get this package which allows writing a GRM
# install.packages("genio")
library(genio) # needed to write_grm

genio::write_grm(kinship = CRM, name = "ozCRM", fam = fam )

# We will also write out a continuous covariate file that does not include a PRS
write.table(all[,c("FID", "IID", "age", "PC1", "PC2", "PC3", "PC4")], "ozht.noprs.qcovar", quote = F, row.names = F)




## Fit several random effects in GCTA
!!!!!!!!! This code should be run in terminal

# We create this text file, which stores the name of the different GRM/CRM matrices so GCTA knows which to open
echo "ozGRM" > GRM-CRM.txt
echo "ozCRM" >> GRM-CRM.txt
head GRM-CRM.txt

gcta64 --pheno ozht.phen --qcovar ozht.prs1.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozhtPRS1.ACE --reml --reml-est-fix
gcta64 --pheno ozht.phen --qcovar ozht.prs2.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozhtPRS2.ACE --reml --reml-est-fix
gcta64 --pheno ozht.phen --qcovar ozht.prs3.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozhtPRS3.ACE --reml --reml-est-fix
gcta64 --pheno ozht.phen --qcovar ozht.prs4.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozhtPRS4.ACE --reml --reml-est-fix
gcta64 --pheno ozht.phen --qcovar ozht.prs5.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozhtPRS5.ACE --reml --reml-est-fix
gcta64 --pheno ozht.phen --qcovar ozht.noprs.qcovar --covar ozht.covar --mgrm GRM-CRM.txt --out ozht.ACE.noPRS --reml --reml-est-fix

grep X_7 *ACE.log > gcta_results_ACE.txt


## Format results
!!!!!!!!! This code should be run in Rstudio

# We can read in the gcta results
resACE<-as.data.frame(read.table("gcta_results_ACE.txt"))
names(resACE)<-c("Model","beta","se")

# To calculate R2 we scale beta into a marginal correlation and then square it
# We are calculating this and adding it to the resACE object
resACE$r2<-rbind((resACE[1,2]/sd(all$ht, na.rm = T)*sd(all$PRS1 , na.rm = T))**2,
    (resACE[2,2]/sd(all$ht, na.rm = T)*sd(all$PRS2 , na.rm = T))**2,
    (resACE[3,2]/sd(all$ht, na.rm = T)*sd(all$PRS3 , na.rm = T))**2,
    (resACE[4,2]/sd(all$ht, na.rm = T)*sd(all$PRS4 , na.rm = T))**2,
    (resACE[5,2]/sd(all$ht, na.rm = T)*sd(all$PRS5 , na.rm = T))**2)


# To test the significance of the betas we can use t-tests
# We are calculating this and adding it to the resACE object
resACE$tStat<-resACE$beta/resACE$se # Follows a student distribution
resACE
# Now we can look at the results
resACE