# Copyright 2019-2020 by Robert M. Kirkpatrick
# Licensed under CC BY 4.0 <http://creativecommons.org/licenses/by/4.0/>

# This is an adaptation of a script previously used at the 2017 AGES Workshop at Virginia Commonwealth University.

#This script demonstrates the use of the GREML feature in a simple but realistic example.
#It first loads a genomic-relatedness matrix (GRM) and a file containing the phenotype and a covariate.  Then, it 
#fits a simple GREML model to estimate additive-genetic variance, residual variance, and heritability.
#The GRM is 1000x1000, and was computed from 50,000 simulated SNPs in linkage equilibrium.
#Note that this analysis is simple enough that it could be done in GCTA (and GCTA would do it more quickly).

require(OpenMx)
#We'll use SLSQP:
mxOption(NULL,"Default optimizer","SLSQP")
#More threads means faster performance but larger memory footprint
#(however, this script cannot use more than 2 threads):
mxOption(NULL,"Number of threads",2)
options(mxCondenseMatrixSlots=TRUE)  #<--Saves memory
#You need to set R's working directory to the directory containing the data files for this demo.
#(i.e., YOU MUST CHANGE THE NEXT LINE TO REFLECT WHERE, IN YOUR HOME DIRECTORY, YOU'VE PLACED THE DATA FILES):
setwd("/faculty/kirkpatrick/2020/GREML/data/")

#Total number of participants:
N <- 1000

# Load and check data: ##################################################################

#Load the GRM's data.  Argument 'prefix' to ReadGRMBin() should be everything in the filename and path of one of the GRM's
#that precedes the ".grm" :
grmstuff <- omxReadGRMBin(prefix="AGES2017GRM",returnList=T)
closeAllConnections()
str(grmstuff)
#^^^grmstuff will be a list of 4 elements:
#     $diag is a numeric vector containing the GRM's diagonal elements.
#     $off is a numeric vector containing the GRM's off-diagonal elements.
#     $id is a dataframe that contains the family and individual IDs orresponding to the rows and columns of the GRM.
#     $N is the number of markers used to compute the GRM.

#Use the elements of grmstuff to make a proper GRM:
#Initialize:
GRM <- matrix(
	0.0,nrow=N,ncol=N,
	dimnames=list(grmstuff$id$V1+grmstuff$id$V2,grmstuff$id$V1+grmstuff$id$V2))
#^^^We're giving the GRM dimnames based on IDs.
#Populate the upper triangle of the GRM, excluding diagonal, with grmstuff$off:
GRM[!lower.tri(GRM,diag=T)] <- grmstuff$off
#Populate the lower triangle, excluding diagonal, of the GRM:
GRM <- GRM + t(GRM)
#Populate the diagonal of the GRM:
diag(GRM) <- grmstuff$diag
#Look at upper left corner of GRM:
GRM[1:5,1:5]

#Load phenotype file:
phenfile <- read.csv("GREMLphenodata.csv",header=T)
#The columns are family ID, individual ID, phenotype, and covariate:
head(phenfile)
#There is one missing value for the phenotype and for the covariate:
sum(is.na(phenfile$y))
sum(is.na(phenfile$x))
#Give phenfile rownames based on IDs:
rownames(phenfile) <- phenfile$famid + phenfile$id
#Check to make sure that there are no individuals represented in the GRM but not in the phenotype file (there won't
#be, since the data are simulated, but this is a good check to make when working with real data):
all(rownames(GRM) %in% rownames(phenfile)) #<--Should return TRUE.
#And check vice versa:
all(rownames(phenfile) %in% rownames(GRM)) #<--Should also be TRUE.
#In fact, the rows of phenfile are matched with those of the GRM:
all(rownames(phenfile)==rownames(GRM)) #<--TRUE.


# Create MxData object & expectation; get empirical start values: ##########################################

#The MxData object.  We will use gremldat$yX as the raw dataset to be used in our MxModel:
mxdat <- mxData(observed=cbind(y=phenfile$y,x=phenfile$x), type="raw", sort=FALSE)
#^^^N.B. the 'sort=FALSE' argument!  I CANNOT EMPHASIZE ENOUGH HOW IMPORTANT THAT IS!!!  By default, OpenMx 
#automatically sorts the rows of a raw dataset at runtime, to try to achieve a perfomance boost.  However, we 
#currently have the rows of y and X sorted exactly the same as in the GRM, so we do not want OpenMx to do any 
#sorting!

#We  tell the GREML espectation that the name of the model-expected covariance matrix will be "V",
#that the one phenotype has column label 'y', that the one covariate has column label 'x', 
#and that a lead column of ones needs to be appended to the 'X' matrix (for the intercept):
ge <- mxExpectationGREML(V="V",yvars="y", Xvars="x", addOnes=T)

#Let's get the residual variance from an OLS regression of the phenotype onto the covariate.  We'll use half
#this residual variance as the start value for the additive-genetic and nonshared-environmental variance 
#components:
orv <- var(lm(y~x, data=phenfile)$residuals)


# Create the MxModel, and run it: ###############################################################################
#(We will create a lot of objects inside the mxModel() statement.  This helps to save memory.)

testmod <- mxModel(
	"GREML_1GRM_1trait", #<--Model name
	#1x1 matrix containing nonshared-environmental (residual) variance component:
	mxMatrix(type = "Full", nrow = 1, ncol=1, free=T, values = orv/2, labels = "ve", 
					 lbound = 0.0001, name = "Ve"),
	#1x1 matrix containing additive-genetic variance component:
	mxMatrix(type = "Full", nrow = 1, ncol=1, free=T, values = orv/2, labels = "va", 
					 name = "Va"),
	#1000x1000 identity matrix--the "relatedness matrix" for the residuals:
	mxMatrix("Iden",nrow=N,name="I"),
	#The GRM:
	mxMatrix("Symm",nrow=N,free=F,values=GRM,name="A"),
	#The model-expected covariance matrix:
	mxAlgebra((A%x%Va) + (I%x%Ve), name="V"), #<--Readable, but could be made more efficient.
	#An MxAlgebra for the heritability:
	mxAlgebra(Va/(Va+Ve), name="h2"),
	mxdat, #<--MxData object
	ge, #<--GREML expectation
	mxFitFunctionGREML() #<--GREML fitfunction
)

#Run and view summary:
testrun <- mxRun(testmod)
summary(testrun)
mxEval(h2,testrun,T)
mxSE(h2,testrun)