#This script (by Rob Kirkpatrick) demonstrates the use of the GREML feature in a factor model. There are three #phenotypes and 1 common factor; the only heritable variance is common variance (i.e., in the common factor). #It first loads a genomic-relatedness matrix (GRM) and a file containing the phenotype and a covariate. Then, it #fits a simple GREML model to estimate additive-genetic variance, residual variance, and heritability. #The GRM is 500x500, and was computed from 50,000 simulated SNPs in linkage equilibrium. require(OpenMx) options(mxCondenseMatrixSlots=TRUE) #<--Saves memory mxOption(NULL,"Default optimizer","NPSOL") mxOption(NULL,"Analytic Gradients","Yes") #You need to set R's working directory to the directory containing the data files for this demo. #(i.e., YOU MUST CHANGE THE NEXT LINE TO REFLECT WHERE, ON YOUR COMPUTER, YOU'VE PLACED THE DATA FILES): setwd("/home/kirkpatrick/2018/mxGREML_R_scripts/") N <- 500 #<--Total number of participants. #^^^We're using a small sample size for the sake of making the MxModel run quickly. 500 individuals is too small #to have any real power with a model like this one. # Load and check data: ################################################################## #This is a function from GCTA's documentation that loads a binary-format GRM into R: ReadGRMBin <- function(prefix, AllN=F, size=4){ sum_i=function(i){ return(sum(1:i)) } BinFileName=paste(prefix,".grm.bin",sep="") NFileName=paste(prefix,".grm.N.bin",sep="") IDFileName=paste(prefix,".grm.id",sep="") id = read.table(IDFileName) n=dim(id)[1] BinFile=file(BinFileName, "rb"); grm=readBin(BinFile, n=n*(n+1)/2, what=numeric(0), size=size) NFile=file(NFileName, "rb"); if(AllN==T){ N=readBin(NFile, n=n*(n+1)/2, what=numeric(0), size=size) } else N=readBin(NFile, n=1, what=numeric(0), size=size) i=sapply(1:n, sum_i) return(list(diag=grm[i], off=grm[-i], id=id, N=N)) } #Load the GRM. Argument 'prefix' to ReadGRMBin() should be everything in the filename and path of one of the GRM's #that precedes the ".grm" : grmstuff <- ReadGRMBin(prefix="Boulder2018FactorModGRM") closeAllConnections() str(grmstuff) #^^^grmstuff will be a list of 4 elements: # $diag is a numeric vector containing the GRM's diagonal elements. # $off is a numeric vector containing the GRM's off-diagonal elements. # $id is a dataframe that contains the family and individual IDs orresponding to the rows and columns of the GRM. # $N is the number of markers used to compute the GRM. #Use the elements of grmstuff to make a proper GRM: #Initialize: GRM <- matrix( 0.0,nrow=N,ncol=N, dimnames=list(grmstuff$id$V1+grmstuff$id$V2,grmstuff$id$V1+grmstuff$id$V2)) #^^^We're giving the GRM dimnames based on IDs. #Populate the upper triangle of the GRM, excluding diagonal, with grmstuff$off: GRM[!lower.tri(GRM,diag=T)] <- grmstuff$off #Populate the lower triangle, excluding diagonal, of the GRM: GRM <- GRM + t(GRM) #Populate the diagonal of the GRM: diag(GRM) <- grmstuff$diag #Look at upper left corner of GRM: GRM[1:5,1:5] #Load phenotype file: phenfile <- read.csv("GREMLFactorModData.csv",header=T) #The columns are family ID, individual ID, phenotype #1, phenotype #2, phenotype #3, a covariate relevant to all #3 phenotypes ('x0'), a covariate relevant to phenotype #1, a covariate relevant to phenotype #2, and a covariate #relevant to phenotype #3: head(phenfile) #There is one missing value on each phenotype: colSums(is.na(phenfile)) #Give phenfile rownames based on IDs: rownames(phenfile) <- phenfile$famid + phenfile$id #Check to make sure that there are no individuals represented in the GRM but not in the phenotype file (there won't #be, since the data are simulated, but this is a good check to make when working with real data): all(rownames(GRM) %in% rownames(phenfile)) #<--Should return TRUE. #And check vice versa: all(rownames(phenfile) %in% rownames(GRM)) #<--Should also be TRUE. #In fact, the rows of phenfile are matched with those of the GRM: all(rownames(phenfile)==rownames(GRM)) #<--TRUE. # Use mxGREMLDataHandler(): ################################################ #We give the data-handler our dataset, tell it which columns are the phenotype, and tell it which covariates go #with each phenotype: gremldat <- mxGREMLDataHandler( data=phenfile,yvars=c("y1","y2","y3"), Xvars=list(c("x0","x1"),c("x0","x2"),c("x0","x3")), addOnes=T,blockByPheno=T,staggerZeroes=T #<--FYI, these are the defaults for these arguments. ) #We can see that three datapoints have been dropped from the data, due to missing data: str(gremldat) #This is what the processed dataset looks like. In the column names of matrix X, the part before the underscore #identifies the phenotype being regressed against the covariate, and the part after the underscore identifies #the covariate ('1' just refers to the constant, for the regression intercept): head(gremldat$yX) #Here we can see where phenotype #1 stops and phenotype #2 starts in y: gremldat$yX[497:502,] #Here we can see where phenotype #2 stops and phenotype #3 starts in y: gremldat$yX[996:1001,] casesToDrop <- gremldat$casesToDrop #Obtain OLS residual variance from a regression of y onto X (for use with start values): orv <- var(lm(gremldat$yX[,1] ~ gremldat$yX[,-1])$residual) # Create MxData object, expectation, fitfunction, and compute plan: ########################################## #The MxData object. We will use gremldat$yX as the raw dataset to be used in our MxModel: mxdat <- mxData(observed=gremldat$yX,type="raw",sort=FALSE) #^^^N.B. the 'sort=FALSE' argument! #We tell the GREML espectation that the name of the model-expected covariance matrix is "V", and that the dataset #being input is y horizontally adhered to X (and therefore, it should not call the data-handler at runtime): ge <- mxExpectationGREML(V="V",dataset.is.yX=TRUE,casesToDropFromV=casesToDrop) #We tell the GREML fitfunction the names of the first partial derivatives of 'V' w/r/t each free parameter: gff <- mxFitFunctionGREML(dV=c(l1="dV_dl1",l2="dV_dl2",l3="dV_dl3",va="dV_dva",vu1="dV_dvu1", vu2="dV_dvu2",vu3="dV_dvu3")) #Custom compute plan, to use Newton-Raphson and see what the optimizer is doing in real time (via the # 'verbose' argument): plan <- mxComputeSequence( steps=list( mxComputeNewtonRaphson(verbose=5L), mxComputeOnce('fitfunction', c('fit','gradient','hessian')), mxComputeStandardError(), mxComputeHessianQuality(), mxComputeReportDeriv(), mxComputeReportExpectation() )) #^^^Note: If you are running the R GUI under Windows, delete the 'verbose=5L' argument in the above. # Create the MxModel: ################################################################################ #(We will create a lot of objects inside the mxModel() statement. This helps to save memory.) factorMod <- mxModel( "GREMLfactordemo", mxdat, ge, gff, plan, #This will be our matrix of factor loadings: mxMatrix(type="Full",nrow=3,ncol=1,free=T,values=sqrt(orv/2), labels=c("l1","l2","l3"), name="Lambda"), #1x1 matrix to hold heritability of the common factor. Note that, to identify the model, we restrict this #parameter to the interval [0,1]: mxMatrix(type="Full",nrow=1,ncol=1,free=T,values=0.5,labels="va",lbound=0,ubound=1,name="Va"), #Column vector containing unique variances. To set start values, we guess that (conditional on X) about half the #variance in each phenotype is unique: mxMatrix(type="Full",nrow=3*N,ncol=1,free=T,values=orv/2, labels=c(rep("vu1",N),rep("vu2",N),rep("vu3",N)),lbound=0.0001,name="Vu"), #The GRM: mxMatrix(type="Symm",nrow=N,free=F,values=GRM,name="A"), #NxN matrix of zeroes: mxMatrix(type="Zero",nrow=N,ncol=N,name="Zip"), #NxN identity matrix: mxMatrix(type="Iden",nrow=N,name="I"), #Covariance matrix of the common factor: mxAlgebra( (A%x%Va) + (I%x%(1-Va)), name="SigmaFac"), #Outer product of Lambda with itself: mxAlgebra( Lambda%*%t(Lambda), name="LamLamT"), #The model-expected covariance matrix, per the Fundamental Theorem of Factor Analysis: mxAlgebra(LamLamT%x%SigmaFac + vec2diag(Vu), name="V"), #First partial derivative of V w/r/t va: mxAlgebra(LamLamT%x%A, name="dV_dva"), #First partial derivative of V w/r/t vu1: mxAlgebra(rbind( cbind(I,Zip,Zip), cbind(Zip,Zip,Zip), cbind(Zip,Zip,Zip) ), name="dV_dvu1"), #First partial derivative of V w/r/t vu2: mxAlgebra(rbind( cbind(Zip,Zip,Zip), cbind(Zip,I,Zip), cbind(Zip,Zip,Zip) ), name="dV_dvu2"), #First partial derivative of V w/r/t vu3: mxAlgebra(rbind( cbind(Zip,Zip,Zip), cbind(Zip,Zip,Zip), cbind(Zip,Zip,I) ), name="dV_dvu3"), #First partial derivative of LamLamT w/r/t l1: mxAlgebra(rbind( cbind(2*Lambda[1,1],Lambda[2,1],Lambda[3,1]), cbind(Lambda[2,1],0,0), cbind(Lambda[3,1],0,0)), name="dLamLamT_dl1"), #First partial derivative of V w/r/t l1: mxAlgebra(dLamLamT_dl1%x%SigmaFac, name="dV_dl1"), #First partial derivative of LamLamT w/r/t l2: mxAlgebra(rbind( cbind(0,Lambda[1,1],0), cbind(Lambda[1,1],2*Lambda[2,1],Lambda[3,1]), cbind(0,Lambda[3,1],0)), name="dLamLamT_dl2"), #First partial derivative of V w/r/t l2: mxAlgebra(dLamLamT_dl2%x%SigmaFac, name="dV_dl2"), #First partial derivative of LamLamT w/r/t l3: mxAlgebra(rbind( cbind(0,0,Lambda[1,1]), cbind(0,0,Lambda[2,1]), cbind(Lambda[1,1],Lambda[2,1],2*Lambda[3,1])), name="dLamLamT_dl3"), #First partial derivative of V w/r/t l3: mxAlgebra(dLamLamT_dl3%x%SigmaFac, name="dV_dl3") ) #Once the MxModel object has been created, we can delete objects in R's workspace which have been copied into the #MxModel, or which we simply don't need anymore. This saves memory: rm(GRM,ge,gff,gremldat,mxdat,grmstuff,plan) #This tells R to do "garbage collection"--it may be able to free up some memory it no longer needs: gc() #Run and view summary: factorRun <- mxRun(factorMod) summary(factorRun) # Conditional re-run, using NPSOL: ####################################################################### #This section of code should be run if the initial mxRun() call results in a status code 4 ("blue"), #or 5 or 6 ("red"). What it does is re-run the MxModel, but with a different optimizer, NPSOL, and exploiting #as much as possible NPSOL's ability to use analytic fitfunction derivatives: if(factorRun$output$status$code > 1){ #This is what NPSOL's developers call a "warm start." It's the upper-triangular Cholesky factor of the Hessian #matrix, evaluated at the start values. We are going to re-run the model again, starting at the values #Newton-Raphson reached, and we therefore have the Hessian matrix at the initial values. The reason the warm #start is useful that it tells NPSOL the curvature of the fitfunction surface at the initial values, which saves #NPSOL the trouble of figuring that out numerically, which in turn cuts down on the number of necessary #fitfunction evaluations: ws <- chol(factorRun$output$hessian) #We will stick a new compute plan in place of the old. Notice that we're giving NPSOL the warm start, telling it #to use the analytic gradient, and also relaxing its optimality tolerance. All of this will help it reach a #solution more quickly: factorRun$compute <- mxComputeSequence( freeSet = c("Va","Lambda","Vu"), steps=list( mxComputeGradientDescent(engine = "NPSOL", useGradient = T, verbose=5, tolerance=1e-7, warmStart = ws), mxComputeOnce('fitfunction', c('fit','gradient','hessian','ihessian')), mxComputeStandardError(), mxComputeReportDeriv(), mxComputeReportExpectation() )) #Re-run and see summary: factorRun <- mxRun(factorRun) summary(factorRun) } #^^^Note: If you need to use MxConstraint objects in a GREML model, you will not be able to use Newton-Raphson. #In such a case, use NPSOL or SLSQP, with useGradient=T. NPSOL/SLSQP will use the analytic first derivatives of the #fitfunction w/r/t free parameters, and deal with the constraint functions numerically.