#This script (by Rob Kirkpatrick) demonstrates the use of the GREML feature in a simple but realistic example.
#It first loads a genomic-relatedness matrix (GRM) and a file containing the phenotype and a covariate.  Then, it 
#fits a simple GREML model to estimate additive-genetic variance, residual variance, and heritability.
#The GRM is 1000x1000, and was computed from 50,000 simulated SNPs in linkage equilibrium.
#Note that this analysis simple enough that it could be done in GCTA (and GCTA would do it more quickly).

require(OpenMx)
options(mxCondenseMatrixSlots=TRUE)  #<--Saves memory
#You need to set R's working directory to the directory containing the data files for this demo.
#(i.e., YOU MUST CHANGE THE NEXT LINE TO REFLECT WHERE, ON YOUR COMPUTER, YOU'VE PLACED THE DATA FILES):
setwd("/home/kirkpatrick/2018/mxGREML_R_scripts/")

#Total number of participants:
N <- 1000

# Load and check data: ##################################################################

#This is a function from GCTA's documentation that loads a binary-format GRM into R:
ReadGRMBin <- function(prefix,
											 AllN=F,
											 size=4){
	sum_i=function(i){
		return(sum(1:i))
	}
	BinFileName=paste(prefix,".grm.bin",sep="")
	NFileName=paste(prefix,".grm.N.bin",sep="")
	IDFileName=paste(prefix,".grm.id",sep="")
	id  =    read.table(IDFileName)
	n=dim(id)[1]
	BinFile=file(BinFileName,
							 "rb");
	grm=readBin(BinFile,
							n=n*(n+1)/2,
							what=numeric(0),
							size=size)
	NFile=file(NFileName,
						 "rb");
	if(AllN==T){
		N=readBin(NFile,
							n=n*(n+1)/2,
							what=numeric(0),
							size=size)
	}
	else
		N=readBin(NFile,
							n=1,
							what=numeric(0),
							size=size)
	i=sapply(1:n,
					 sum_i)
	return(list(diag=grm[i],
							off=grm[-i],
							id=id,
							N=N))
}

#Load the GRM.  Argument 'prefix' to ReadGRMBin() should be everything in the filename and path of one of the GRM's
#that precedes the ".grm" :
grmstuff <- ReadGRMBin(prefix="Boulder2018GRM")
closeAllConnections()
str(grmstuff)
#^^^grmstuff will be a list of 4 elements:
#     $diag is a numeric vector containing the GRM's diagonal elements.
#     $off is a numeric vector containing the GRM's off-diagonal elements.
#     $id is a dataframe that contains the family and individual IDs corresponding to the rows and columns of the GRM.
#     $N is the number of markers used to compute the GRM.

#Use the elements of grmstuff to make a proper GRM:
#Initialize:
GRM <- matrix(
	0.0,nrow=N,ncol=N,
	dimnames=list(grmstuff$id$V1+grmstuff$id$V2,grmstuff$id$V1+grmstuff$id$V2))
#^^^We're giving the GRM dimnames based on IDs.
#Populate the upper triangle of the GRM, excluding diagonal, with grmstuff$off:
GRM[!lower.tri(GRM,diag=T)] <- grmstuff$off
#Populate the lower triangle, excluding diagonal, of the GRM:
GRM <- GRM + t(GRM)
#Populate the diagonal of the GRM:
diag(GRM) <- grmstuff$diag
#Look at upper left corner of GRM:
GRM[1:5,1:5]

#Load phenotype file:
phenfile <- read.csv("GREMLphenodata.csv",header=T)
#The columns are family ID, individual ID, phenotype, and covariate:
head(phenfile)
#There is one missing value for the phenotype and for the covariate:
sum(is.na(phenfile$y))
sum(is.na(phenfile$x))
#Give phenfile rownames based on IDs:
rownames(phenfile) <- phenfile$famid + phenfile$id
#Check to make sure that there are no individuals represented in the GRM but not in the phenotype file (there won't
#be, since the data are simulated, but this is a good check to make when working with real data):
all(rownames(GRM) %in% rownames(phenfile)) #<--Should return TRUE.
#And check vice versa:
all(rownames(phenfile) %in% rownames(GRM)) #<--Should also be TRUE.
#In fact, the rows of phenfile are matched with those of the GRM:
all(rownames(phenfile)==rownames(GRM)) #<--TRUE.


# Use mxGREMLDataHandler(): ################################################

#Give the data-handler the dataset, and tell it that the one phenotype has column label 'y', 
#that the one covariate has column label 'x', and that a lead column of ones needs to be appended to the 'X' 
#matrix (for the intercept):
gremldat <- mxGREMLDataHandler(data=phenfile, yvars="y", Xvars="x", addOnes=T)
#We can see that participants #7 and #77 in phenfile were dropped due to missing data:
str(gremldat)
#We can see that the first column of gremldata$yX is phenotype vector y, and that the remaining columns constitute
#the X matrix--a regression constant and the covariate:
head(gremldat$yX)
#Number of datapoints, post-data-handling:
Np <- nrow(gremldat$yX)
#Drop rows and columns #7 and #77 from the GRM:
GRM <- GRM[-c(7,77),-c(7,77)]


# Create MxData object, expectation, fitfunction, and compute plan: ##########################################

#The MxData object.  We will use gremldat$yX as the raw dataset to be used in our MxModel:
mxdat <- mxData(observed=gremldat$yX, type="raw", sort=FALSE)
#^^^N.B. the 'sort=FALSE' argument!  I CANNOT EMPHASIZE ENOUGH HOW IMPORTANT THAT IS!!!  By default, OpenMx 
#automatically sorts the rows of a raw dataset at runtime, to try to achieve a perfomance boost.  However, we 
#currently have the rows of y and X sorted exactly the same as in the GRM, so we do not want OpenMx to do any 
#sorting!

#We  tell the GREML espectation that the name of the model-expected covariance matrix is "V", and that the dataset
#being input is y horizontally adhered to X (and therefore, it should not call the data-handler at runtime):
ge <- mxExpectationGREML(V="V",dataset.is.yX=TRUE)
#^^^Note: mxExpectationGREML() has argument 'casesToDropFromV', to which we could have passed 
#gremldat$casesToDrop.  In that case, we could define our covariance matrix and its derivatives as 
#full-size NxN matrices, and allow the backend to use the value of 'casesToDropFromV' to automatically 
#trim rows and columns in those matrices that correspond to missing observations (i.e., participants #7 and #77).
#However, that backend  matrix-resizing process carries a performance cost.

#The GREML fitfunction tells OpenMx that the derivative of 'V' with respect to free parameter 
# 'va'(the additive-genetic variance) is a matrix named 'A', and that the derivative of 'V' w/r/t free parameter
# 've' is a matrix named 'I'.  At runtime, the GREML fitfunction will use these derivatives to help with 
#optimization and to compute standard errors:
gff <- mxFitFunctionGREML(dV=c(va="A",ve="I"))

#This is a custom compute plan.  It is necessary because we want to use the Newton-Raphson optimizer, which
#can use analytic first and second derivatives of the GREML fitfunction to speed up convergence:
plan <- mxComputeSequence(
	#List of steps.  The first is the most notable.  It tells OpenMx to use the Newton-Raphson optimizer, and to 
	#display what it's doing in real time (via the verbose argument):
	steps=list(
		mxComputeNewtonRaphson(verbose=5L),
		mxComputeOnce("fitfunction", c("fit","gradient","hessian")),
		mxComputeStandardError(),
		mxComputeHessianQuality(),
		mxComputeReportDeriv(),
		mxComputeReportExpectation()
))
#^^^Note:  If you are running the R GUI under Windows, delete the 'verbose=5L' argument in the above.


# Create the MxModel, and run it: ###############################################################################
#(We will create a lot of objects inside the mxModel() statement.  This helps to save memory.)

testmod <- mxModel(
	"GREML_1GRM_1trait", #<--Model name
	#1x1 matrix containing nonshared-environmental (residual) variance component:
	mxMatrix(type = "Full", nrow = 1, ncol=1, free=T, values = var(gremldat$yX[,"y"])/2, labels = "ve", 
					 lbound = 0.0001, name = "Ve"),
	#1x1 matrix containing additive-genetic variance component:
	mxMatrix(type = "Full", nrow = 1, ncol=1, free=T, values = var(gremldat$yX[,"y"])/2, labels = "va", 
					 lbound=0, name = "Va"), #<--Note the lower bound on zero.
	#998x998 identity matrix--the "relatedness matrix" for the residuals:
	mxMatrix("Iden",nrow=Np,name="I"),
	#The GRM:
	mxMatrix("Symm",nrow=Np,free=F,values=GRM,name="A"),
	#The model-expected covariance matrix:
	mxAlgebra((A%x%Va) + (I%x%Ve), name="V"),
	#An MxAlgebra for the heritability:
	mxAlgebra(Va/(Va+Ve), name="h2"),
	mxdat, #<--MxData object
	ge, #<--GREML expectation
	gff, #<--GREML fitfunction
	plan #<--Custom compute plan
)

#Run and view summary:
testrun <- mxRun(testmod)
summary(testrun)
mxEval(h2,testrun,T)

#If we had added the appropriate additional step to the compute plan, we could have requested 
#confidence intervals for va, ve, and/or h2 (but not for the regression coefficients).
#We have asymptotic-ML-theory standard errors for va and ve.  It is possible to obtain an SE for h2
#using a delta-method approximation, though its accuracy is quesionable given the fairly small N:
scm <- chol2inv(chol(testrun$output$hessian/2)) #<--Sampling covariance matrix for ve and va
pointest <- testrun$output$estimate #<--Point estimates of ve and va
h2se <- sqrt(
	(pointest[2]/(pointest[1]+pointest[2]))^2 * (
		(scm[2,2]/pointest[2]^2) - (2*scm[1,2]/pointest[1]/(pointest[1]+pointest[2])) + 
			(sum(scm)*(pointest[1]+pointest[2])^-2)
	))
print(h2se)

#It's easy to test the null hypothesis of zero h2.  If there is no genetic variance, then our model reduces to
#an OLS regression of the phenotype onto the covariate:
lmod <- lm(y~x,data=as.data.frame(gremldat$yX))
#Obtain a likelihood-ratio test statistic:
LRT <- as.numeric(-2*logLik(lmod) - testrun$fitfunction$MLfit)
#(N.B. this is of somewhat dubious statistical validity, since we are using the MxModel's ML fit, but
#what was optimized was its REML fit.)

#Recall that we set a lower bound of zero on the genetic variance. Zero is also the value of the genetic variance
#under the null hypothesis.  We are effectively doing a one-sided hypothesis test.  Under the null hypothesis,
#the LRT follows a mixture distribution, the 0.95 quantile of which is about 2.707:
LRT > 2.707 #<--Significant, at significance level 0.05.
1-(0.5+0.5*pchisq(LRT,1)) #<--One-sided p-value.