# ----------------------------------------------------------------------- # Program: NTFD ADSE model. No Vertical Transmission but includes Assortative Mating # Author: Matt Keller # Date: May 30, 2022 #NOTE: Please turn on "soft wrapping" in your RStudio Server session so that you can easily read the comments. To do this, in RStudio Server, go to "Tools" -> "Global Options" -> "Code" and then check "Soft-wrap R source files". #install OpenMx if not already done - two ways to do it #install.packages("OpenMx") #if you don't need NPSOL (but we do!) #source('https://vipbg.vcu.edu/vipbg/OpenMx2/software/getOpenMx.R') #if you need NPSOL - note that this won't work if you run R version 4.0 or higher, in which case, you need to have an older version of R on your machine. See, e.g., here for how to do that: https://jacobrprice.github.io/2019/09/19/Installing-multiple-parallel-R-versions.html require(OpenMx) #NOTE: OpenMx uses CSOLNP optimizer by default, but this doesn't work with nonlinear constraints (which we have below). We need NPSOL, but unfortunately this can be a PITA to get installed (see above). #For more on NPSOL vs. CSOLNP, see https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4516707/ #Change default optimizer mxOption(NULL,"Default optimizer","NPSOL") #NPSOL optimizer - crucial for fitting non-linear constraints #Check the options we have now for using openmx options()$mxOptions$'Number of Threads' options()$mxOptions$'Default optimizer' #Let's go ahead and read in a script I wrote that contains handy functions dimx() info(), LS(), and look(). source() simply reads in another R script and runs every line of its code. Here, that code contains the syntax for the four functions I wrote. No big deal if you don't want to (or can't) install them. These functions are just for convenience; the script runs fine without them (though you'll get an error when calling info() below) source("http://www.matthewckeller.com/R.Class/KellerScript2.R") # ----------------------------------------------------------------------- ####################################################### # ----------------------------------------------------------------------- # START Section A: Fit ACE CTD Model on Dataset 1 (including only the twins) # ----------------------------------------------------------------------- #Get the data we will be using and set it up # ----------------------------------------------------------------------- #Download the simulated NTFD dataset 1 from my website con <- "http://www.matthewckeller.com/public/ntf1.data" ntf1.data <- read.table(con,header=TRUE,na.strings=-999) #note that -999 means "NA" in this dataset; na.strings=-999 has R change those to NAs nv <- 1 #Number of variables; this allows the same script to be used for multivariate models #Look at Data info(ntf1.data) #won't run if you haven't installed my user-defined functions above head(ntf1.data) str(ntf1.data) #Get summary of each column in ntf1.data & look at 4 histograms summary(ntf1.data) hist(ntf1.data$tw1) #data looks normal - normality is more important in ML than in OLS hist(ntf1.data$tw2) hist(ntf1.data$father) hist(ntf1.data$mother) #Say we had NOT collected data on parents. Create the data we'd use in a Classical Twin Design mz.ctd1 <- ntf1.data[ntf1.data$twin.type=='MZ',3:4] #grabs rows where twin.type=="MZ" and columns 3 to 4 dz.ctd1 <- ntf1.data[ntf1.data$twin.type=='DZ',3:4] #grabs rows where twin.type=="DZ" and columns 3 to 4 #Summaries of data summary(mz.ctd1) summary(dz.ctd1) #Look at covariances (CVMat.mz <- round(cov(mz.ctd1,use='pairwise'),3)) (CVMat.dz <- round(cov(dz.ctd1,use='pairwise'),3)) (CVmz <- CVMat.mz[1,2]) (CVdz <- CVMat.dz[1,2]) (Vp <- (sum(diag(CVMat.mz)) + sum(diag(CVMat.dz)))/4) #ctd1.data CTD ACE Model Method of Moments (MoM) estimates (MoM.VA.hat <- 2*(CVmz - CVdz) ) (MoM.VC.hat <- 2*CVdz - CVmz ) (MoM.VE.hat <- Vp - CVmz) # ----------------------------------------------------------------------- #Set up start values and set whether the parameters are free or fixed. #NOTE: For identified models, start values should not affect the final best fit. We are fitting an ACE model (assuming VD=0), thus we should have an identified model # ----------------------------------------------------------------------- (VA.st.A <- runif(1,.4,.8)) (VA.est.A <- TRUE) #TRUE" means we allow VA to be estimated freely (VD.st.A <- 0) (VD.est.A <- FALSE) #"FALSE" means we fix VD to it's start value (0) (VC.st.A <- runif(1,.2,.6)) (VC.est.A <- TRUE) #TRUE" means we allow VC to be estimated freely # ----------------------------------------------------------------------- #Set up the ACE CTD # ----------------------------------------------------------------------- #NOTE: we use the path coefficient model approach for ETFDs below, and thus for consistency, I do so as well for the CTD estimates. We can't (at least I don't know how to) use the direct symmetric approach, where we directly estimate the variances, when there are non-linear constraints #NOTE: Notice the use of the 6 "wildcard" variables created in the section above in the mxMatrix functions below. Wildcards are useful for many reasons in R scripts. Here, they make it easy to see what parameters are changing and easy to change them if we want. Even though they make your script longer, they make the script easier to read and decrease the chance of user error a <- mxMatrix(type="Lower",nrow=nv,ncol=nv,free=VA.est.A,values=VA.st.A,label="AddGenPath",name="a") d <- mxMatrix(type="Lower",nrow=nv,ncol=nv,free=VD.est.A,values=VD.st.A,label="DomPath",name="d") c <- mxMatrix(type="Lower",nrow=nv,ncol=nv,free=VC.est.A,values=VC.st.A,label="SibPath", name="c") e <- mxMatrix(type="Lower",nrow=nv,ncol=nv,free=T,values=3,label="EnvPath",name="e") # mxAlgebra - Expected Variances and Relative Covariances VP <- mxAlgebra(a %*% a + e %*% t(e) + d %*% t(d) + c %*% t(c),name="VP") CvMz <- mxAlgebra(a %*% t(a) + d %*% t(d) + c %*% t(c),name="CvMz") #MZ CvDz <- mxAlgebra(.50 %x% a %*% t(a) + .25 %x% d %*% t(d) + c %*% t(c),name="CvDz") #DZ #Put these relative covariances together into MZ relatives and DZ relatives matrices MZ.cv <- mxAlgebra(rbind( cbind(VP, CvMz), cbind(CvMz, VP)), dimnames=list(colnames(mz.ctd1),colnames(mz.ctd1)),name="expCovMZ") DZ.cv <- mxAlgebra(rbind( cbind(VP, CvDz), cbind(CvDz, VP)), dimnames=list(colnames(dz.ctd1),colnames(dz.ctd1)),name="expCovDZ") #Put the relatives means together into means matrix (same for MZ and DZ rels) Both.means <- mxMatrix(type="Full", nrow=1, ncol=2*nv, free=TRUE, values= .01, label="mean",dimnames=list(NULL, colnames(mz.ctd1)), name="expMeanBoth") #Put in the data dataMZ1 <- mxData(observed=mz.ctd1, type="raw") dataDZ1 <- mxData(observed=dz.ctd1, type="raw") #Objectives for two groups objMZ <- mxExpectationNormal(covariance="expCovMZ",means="expMeanBoth", dimnames=c('tw1','tw2')) objDZ <- mxExpectationNormal(covariance="expCovDZ",means="expMeanBoth", dimnames=c('tw1','tw2')) #Combine groups myfitfun <- mxFitFunctionML() paramsA <- list(a,d,e,c,VP,CvMz,CvDz,Both.means,MZ.cv,DZ.cv,myfitfun) modelMZ1 <- mxModel("MZctd1",paramsA,dataMZ1,objMZ) modelDZ1 <- mxModel("DZctd1",paramsA,dataDZ1,objDZ) obj <- mxFitFunctionMultigroup(c("MZctd1","DZctd1")) ACE.ctd1.Model <- mxModel("ctd1ACE",modelMZ1,modelDZ1,obj) # ----------------------------------------------------------------------- #Run the model and get estimates # ----------------------------------------------------------------------- #Run the model #NOTE: Code "GREEN" is OK. Code "RED" usually means there's a problem ACE.ctd1.Fit <- mxRun(ACE.ctd1.Model,intervals=FALSE) (ACE.ctd1.Fit.Summary <- summary(ACE.ctd1.Fit)) #Is our model identified according to openMx? mxCheckIdentification(ACE.ctd1.Fit) #-2ll - ask your neighbors/breakout colleagues what theirs was ACE.ctd1.Fit.Summary$Minus2LogLikelihood #Estimates - ask your neighbors/breakout colleagues what theirs were VA.st.A^2 #VA start mxEval(a%*%t(a), ACE.ctd1.Fit$MZctd1) #VA VD.st.A^2 #VD start mxEval(d%*%t(d), ACE.ctd1.Fit$MZctd1) #VD (fixed) VC.st.A^2 #VC start mxEval(c%*%t(c), ACE.ctd1.Fit$MZctd1) #VC #NOTE: Did your start values influence your final estimates? (If you don't have colleagues to compare these to, just rerun the entire Section A, which will generate new start values). How did your Maximum Likelihood estimates from openMx compare to the estimates you derived algebraically? # ----------------------------------------------------------------------- # ----------------------------------------------------------------------- # END Section A: Fit ACE CTD Model on Dataset 1 (including only the twins) # ----------------------------------------------------------------------- ####################################################### ####################################################### # ----------------------------------------------------------------------- # START Section B: Fit ACE CTD "SENSITIVITY" Model on Dataset 1 where VD is fixed to .05 # ----------------------------------------------------------------------- #NOTE: Let's now do something we would not normally do: Fit an ACE model to data but assume that VD is non-zero. This is called a "sensitivity analysis": what would our CTD results be if our assumption that VD=0 is wrong? Here, we'll see what would happen if VD were actually .05 #NOTE: To do this, we don't have to rewrite the whole model above; we can just use omxSetParameters to change the parameters of interest (here, d) # ----------------------------------------------------------------------- (VA.st.B <- runif(1,.4,.8)) (VA.est.B <- TRUE) #TRUE" means we allow VA to be estimated freely (VD.st.B <- sqrt(.05)) (VD.est.B <- FALSE) #"FALSE" means we fix VD to it's start value (.05) (VC.st.B <- runif(1,.2,.6)) (VC.est.B <- TRUE) #TRUE" means we allow VC to be estimated freely ACE.ctd1.d05.Model <- omxSetParameters(ACE.ctd1.Model,"DomPath",free=VD.est.B,values=VD.st.B) # ----------------------------------------------------------------------- #Run the model and get estimates # ----------------------------------------------------------------------- #Run the model ACE.ctd1.d05.Fit <- mxRun(ACE.ctd1.d05.Model,intervals=FALSE) (ACE.ctd1.d05.Fit.Summary <- summary(ACE.ctd1.d05.Fit)) #Is our model identified according to openMx? mxCheckIdentification(ACE.ctd1.d05.Fit) #-2ll - ask your neighbors/breakout colleagues what theirs was ACE.ctd1.d05.Fit.Summary$Minus2LogLikelihood #Estimates - ask your neighbors/breakout colleagues what theirs were VA.st.B^2 #VA start mxEval(a%*%t(a), ACE.ctd1.d05.Fit$MZctd1) #VA VD.st.B^2 #VD start mxEval(d%*%t(d), ACE.ctd1.d05.Fit$MZctd1) #VD (fixed) VC.st.B^2 #VC start mxEval(c%*%t(c), ACE.ctd1.d05.Fit$MZctd1) #VC #NOTE: Why did your estimates of VA and VC change? # ----------------------------------------------------------------------- # ----------------------------------------------------------------------- # END Section B: Fit ACE CTD "SENSITIVITY" Model on Dataset 1 where VD is fixed to .05 # ----------------------------------------------------------------------- ####################################################### ####################################################### # ----------------------------------------------------------------------- # START Section C: Fit ACE CTD "SENSITIVITY" Model on Dataset 1 where VD is fixed to .12 # ----------------------------------------------------------------------- #NOTE: Again, what would our CTD results be if our assumption that VD=0 is wrong? Here, we'll see what would happen if VD were actually .12 #Now use omxSetParameters to change the parameters of interest (here, d) # ----------------------------------------------------------------------- (VA.st.C <- runif(1,.4,.8)) (VA.est.C <- TRUE) #TRUE" means we allow VA to be estimated freely (VD.st.C <- sqrt(.12)) (VD.est.C <- FALSE) #"FALSE" means we fix VD to it's start value (0) (VC.st.C <- runif(1,.2,.6)) (VC.est.C <- TRUE) #TRUE" means we allow VC to be estimated freely ACE.ctd1.d12.Model <- omxSetParameters(ACE.ctd1.Model,"DomPath",free=VD.est.C,values=VD.st.C) # ----------------------------------------------------------------------- #Run the model and get estimates # ----------------------------------------------------------------------- #Run the model (NOTE: Code "GREEN" is OK. Code "RED" usually means there's a problem) ACE.ctd1.d12.Fit <- mxRun(ACE.ctd1.d12.Model,intervals=FALSE) (ACE.ctd1.d12.Fit.Summary <- summary(ACE.ctd1.d12.Fit)) #Is our model identified according to openMx? mxCheckIdentification(ACE.ctd1.d12.Fit) #-2ll - ask your neighbors/breakout colleagues what theirs was ACE.ctd1.d12.Fit.Summary$Minus2LogLikelihood #Estimates - as your neighbors/breakout colleagues what theirs were VA.st.C^2 #VA start mxEval(a%*%t(a), ACE.ctd1.d12.Fit$MZctd1) #VA VD.st.C^2 #VD start mxEval(d%*%t(d), ACE.ctd1.d12.Fit$MZctd1) #VD (fixed) VC.st.C^2 #VC start mxEval(c%*%t(c), ACE.ctd1.d12.Fit$MZctd1) #VC #NOTE: If VD were actually .12, how biased would your estimates of VA and VC have been in the usual CTD where we assume (fix) VD = 0? # ----------------------------------------------------------------------- # ----------------------------------------------------------------------- # END Section C: Fit ACE CTD "SENSITIVITY" Model on Dataset 1 where VD is fixed to .12 # ----------------------------------------------------------------------- ####################################################### ####################################################### # ----------------------------------------------------------------------- # START Section D: Fit ADCE CTD Model on Dataset 1 (including only the twins) # ----------------------------------------------------------------------- #NOTE: Now let's now do something we REALLY would not normally do: Fit an non-identified ADCE model in a CTD. To do this, we will allow VA, VD, AND VC to be estimated freely #Use omxSetParameters to change the parameters of interest (here, a, d, and c) # ----------------------------------------------------------------------- (VA.st.D <- runif(1,.4,.8)) (VA.est.D <- TRUE) #TRUE" means we allow VA to be estimated freely (VD.st.D <- runif(1,.2,.6)) (VD.est.D <- TRUE) #TRUE" means we allow VD to be estimated freely (VC.st.D <- runif(1,.2,.6)) (VC.est.D <- TRUE) #TRUE" means we allow VC to be estimated freely ADCE.ctd1.Model <- omxSetParameters(ACE.ctd1.Model,c("AddGenPath","DomPath","SibPath"),free=c(VA.est.D,VD.est.D,VC.est.D),values=c(VA.st.D,VD.st.D,VC.st.D)) # ----------------------------------------------------------------------- #Run the model and get estimates # ----------------------------------------------------------------------- #Run the model ADCE.ctd1.Fit <- mxRun(ADCE.ctd1.Model,intervals=FALSE) (ADCE.ctd1.Fit.Summary <- summary(ADCE.ctd1.Fit)) #Is our model identified according to openMx? mxCheckIdentification(ADCE.ctd1.Fit) #-2ll - ask your neighbors/breakout colleagues what theirs was ADCE.ctd1.Fit.Summary$Minus2LogLikelihood #Estimates & start values- ask your neighbors/breakout colleagues what theirs were VA.st.D^2 #VA start mxEval(a%*%t(a), ADCE.ctd1.Fit$MZctd1) #VA VD.st.D^2 #VD start mxEval(d%*%t(d), ADCE.ctd1.Fit$MZctd1) #VD VC.st.D^2 #VC start mxEval(c%*%t(c), ADCE.ctd1.Fit$MZctd1) #VC #NOTE: Did you start values influence your final estimates? Why was the -2LL the same as before? You can rerun Section C a few time to see what happens to your estimates. # ----------------------------------------------------------------------- # ----------------------------------------------------------------------- # END Section D: Fit ADCE CTD Model on Dataset 1 (including only the twins) # ----------------------------------------------------------------------- ####################################################### ####################################################### # ----------------------------------------------------------------------- # START Section E: Fit ADSE + AM Nuclear Twin Family Design (NTFD) Model on Dataset 1 (including both parents) # ----------------------------------------------------------------------- #NOTE: We are now going to learn how to fit a NTFD model. This model accounts for the possibility of assortative mating (AM). However, it does not model vertical transmission (the variance of which is VF). One cannot estimate both siblings environmental effects (S) and familial environmental effects arising from vertical transmission (F) simultaneously in a NTFD. However, Stealth and Cascade models have sufficient information to do this. #NOTE: In an ADSE NTFD, "S" (the "sibling environmental effect") is akin to "C" in a Classical Twin Design. We call it "S" in NTFDs to distinguish it from "F" ("familial environmental effect") that arises from vertical transmission. C (in CTD) = S + F (in NTFD). So we could have called it "C" here because we're not fitting an F component, but for consistency with other NTFDs, we keep the "S" nomenclature. #Get the data we will be using and set it up # ----------------------------------------------------------------------- #NOTE: Here, we will use the SAME data as in Section A, but we will now include both parents per family mz.ntf1 <- ntf1.data[ntf1.data$twin.type=='MZ',3:6] #grabs rows where twin.type=="MZ" and columns 3 to 6 dz.ntf1 <- ntf1.data[ntf1.data$twin.type=='DZ',3:6] #grabs rows where twin.type=="DZ" and columns 3 to 6 #Summaries of data summary(mz.ntf1) summary(dz.ntf1) #Look at covariances (mz.fam.cv <- round(cov(mz.ntf1,use='pairwise'),3)) (dz.fam.cv <- round(cov(dz.ntf1,use='pairwise'),3)) # ----------------------------------------------------------------------- #Let's now fit an ADSE model but including parental information, so we should have an identified NTFD model # ----------------------------------------------------------------------- (VA.st.E <- runif(1,.4,.8)) (VA.est.E <- TRUE) #TRUE" means we allow VA to be estimated freely (VD.st.E <- runif(1,.2,.6)) (VD.est.E <- TRUE) #TRUE" means we allow VD to be estimated freely (VS.st.E <- runif(1,.2,.6)) (VS.est.E <- TRUE) #TRUE" means we allow VS to be estimated freely # ----------------------------------------------------------------------- #Set up the ADSE NTFD # ----------------------------------------------------------------------- # NOTE: we use the path coefficient model approach for ETFDs. We can't (at least I don't know how to) use the direct symmetric approach, where we directly estimate the variances, when there are non-linear constraints a <- mxMatrix(type="Lower",nrow=nv,ncol=nv,free=VA.est.E,values=VA.st.E,label="AddGenPath",name="a") d <- mxMatrix(type="Lower",nrow=nv,ncol=nv,free=VD.est.E,values=VD.st.E,label="DomPath",name="d") s <- mxMatrix(type="Lower",nrow=nv, ncol=nv,free=VS.est.E,values=VS.st.E,label="SibPath", name="s") e <- mxMatrix(type="Lower",nrow=nv,ncol=nv,free=T,values=3,label="EnvPath",name="e") mu <- mxMatrix(type="Lower",nrow=nv,ncol=nv,free=T,values=.0,label="AMCopath",name="mu") #Defining q1 and q2 & equating nonlinear constraint for q (the variance of latent variable A). #NOTE: This is how we fit nonlinear constraints in openMx. You have to first define a parameter (here q1) that is "freely" estimated. This gives the value of q a starting place in all the other parameters that depend on q (delta, VP, and all the covariances). You then define another parameter (here q2) that is a function of other parameters using mxAlgebra. Finally, you set the two equal to one another using mxConstraint. Thus, openMx starts q at the start values of q1 (here, 1), but before ending that iteration, forces q1 to equal q2. It then iterates again and again until convergence. #NOTE: q is a non-linear constraint because q is a function of itself (i.e., it is a function of delta, and delta is a function of q). This type of circularity defines a common type of nonlinear constraints in ETFDs. #NOTE: Given that the circularity involves delta and q, we could have decided to make delta the non-linear constraint instead of q. The estimates would have been identical. #NOTE: Sometimes it's not easy to see which parameters should be considered nonlinear constrants and constrained using mxConstraint(). If in doubt, use trial & error with a simulated dataset in which you know the ground truth. q1 <- mxMatrix(type="Full",nrow=nv,ncol=nv,free=T,values=1,label="LatentVarAddGen",name="q1") q2 <- mxAlgebra(1 + delta %&% mu,name="q2") qCon <- mxConstraint(q1==q2,name='qCon') #mxAlgebra - Expected Variances and Relative Covariances #NOTE: When there is a non-linear constraint equating an mxMatrix object to an mxAlgebra object, you should set your mxAlgebras (below) to be functions of the mxMatrix (not mxAlgebra) object of that constraint so that openMx estimates can start somewhere. Thus, the algebra's below are functions of q1, not q2. delta <- mxAlgebra(q1 %*% a,name="delta") VP <- mxAlgebra(a %&% q1 + e %*% t(e) + d %*% t(d) + s %*% t(s),name="VP") CvMz <- mxAlgebra(a %&% q1 + d %*% t(d) + s %*% t(s),name="CvMz") #MZ CvDz <- mxAlgebra(a %&% (q1-.5) + .25 %x% d %*% t(d) + s %*% t(s),name="CvDz") #DZ CvPO <- mxAlgebra(.5 %x% a %*% delta + .5 %x% a %*% delta %*% mu %*% VP,name="CvPO") #Parent-Offspring CvSps <- mxAlgebra(VP %&% mu,name="CvSps") #Spouse #Put these relative covariances together into MZ relatives and DZ relatives matrices MZ.rel.cv <- mxAlgebra(rbind( cbind(VP, CvMz, CvPO, CvPO), cbind(CvMz, VP, CvPO, CvPO), cbind(CvPO, CvPO, VP, CvSps), cbind(CvPO, CvPO, CvSps, VP)), dimnames=list(colnames(mz.ntf1),colnames(mz.ntf1)),name="expCovMZRels") DZ.rel.cv <- mxAlgebra(rbind( cbind(VP, CvDz, CvPO, CvPO), cbind(CvDz, VP, CvPO, CvPO), cbind(CvPO, CvPO, VP, CvSps), cbind(CvPO, CvPO, CvSps, VP)), dimnames=list(colnames(dz.ntf1),colnames(dz.ntf1)),name="expCovDZRels") #Put the relatives means together into means matrix (same for MZ and DZ rels) Both.means <- mxMatrix(type="Full", nrow=1, ncol=4*nv, free=TRUE, values= .01, label="mean",dimnames=list(NULL, colnames(mz.ntf1)), name="expMeanBoth") #Put in the data dataMZRel1 <- mxData(observed=mz.ntf1, type="raw") dataDZRel1 <- mxData(observed=dz.ntf1, type="raw") #Objectives for two groups objMZRels <- mxExpectationNormal(covariance="expCovMZRels",means="expMeanBoth", dimnames=c('tw1','tw2','father','mother')) objDZRels <- mxExpectationNormal(covariance="expCovDZRels",means="expMeanBoth", dimnames=c('tw1','tw2','father','mother')) #Combine groups myfitfun <- mxFitFunctionML() paramsE <- list(a,d,e,mu,s,VP,q1,q2,delta,CvMz,CvDz,CvPO,CvSps,Both.means,MZ.rel.cv,DZ.rel.cv,myfitfun) modelMZ.ntf <- mxModel("MZntf1",paramsE,dataMZRel1,objMZRels,qCon) modelDZ.ntf <- mxModel("DZntf1",paramsE,dataDZRel1,objDZRels) objE <- mxFitFunctionMultigroup(c("MZntf1","DZntf1")) ADSE.ntf1.Model <- mxModel("ntf1ADSE",modelMZ.ntf,modelDZ.ntf,objE) # ----------------------------------------------------------------------- #Run the model and get estimates # ----------------------------------------------------------------------- #Run the model ADSE.ntf1.Fit <- mxRun(ADSE.ntf1.Model,intervals=FALSE) (ADSE.ntf1.Fit.Summary <- summary(ADSE.ntf1.Fit)) #Is our model identified according to openMx? mxCheckIdentification(ADSE.ntf1.Fit) #-2ll - ask your neighbors/breakout colleagues what theirs was ADSE.ntf1.Fit.Summary$Minus2LogLikelihood #Estimates & start values- as your neighbors/breakout colleagues what theirs were VA.st.E^2 #VA start mxEval(a%*%q2%*%t(a), ADSE.ntf1.Fit$MZntf1) #VA VD.st.E^2 #VD start mxEval(d%*%t(d), ADSE.ntf1.Fit$MZntf1) #VD VS.st.E^2 #VS start mxEval(s%*%t(s), ADSE.ntf1.Fit$MZntf1) #VS mxEval(e%*%t(e), ADSE.ntf1.Fit$MZntf1) #VE ADSE.ntf1.Fit$MZntf1$CvSps$result #CV.Sps ADSE.ntf1.Fit$MZntf1$VP$result #VP #NOTE: Do you think our model is identified? Why? How do your estimates compare to those from the usual CTD estimates in Section A? Which do you believe more? Why? What source of bias did the NTFD take care of? What other issues might cause bias in these estimates? # ----------------------------------------------------------------------- # ----------------------------------------------------------------------- # END Section E: Fit ADSE + AM NTFD Model on Dataset 1 (including both parents) # ----------------------------------------------------------------------- ####################################################### ####################################################### # ----------------------------------------------------------------------- # START Section F: Fit ACE CTD Model on Dataset 2 (including only the twins) # ----------------------------------------------------------------------- #NOTE: Here, we will get CTD estimates from data simulated under a different scenario from above. #Get the data (dataset 2) we will be using and set it up # ----------------------------------------------------------------------- #Download the simulated NTFD dataset 2 from the web con <- "http://www.matthewckeller.com/public/ntf2.data" ntf2.data <- read.table(con,header=TRUE,na.strings=-999) #note that -999 means "NA" in this dataset; na.strings=-999 has R change those to NAs nv <- 1 #Number of variables; this allows the same script to be used for multivariate models #Look at Data info(ntf2.data) #won't run if you haven't installed my user-defined functions above head(ntf2.data) str(ntf2.data) summary(ntf2.data) #Say we had NOT collected data on parents. Create the data we'd use in a Classical Twin Design mz.ctd2 <- ntf2.data[ntf2.data$twin.type=='MZ',3:4] #grabs rows where twin.type=="MZ" and columns 3 to 4 dz.ctd2 <- ntf2.data[ntf2.data$twin.type=='DZ',3:4] #grabs rows where twin.type=="DZ" and columns 3 to 4 #Look at covariances (CVMat.mz2 <- round(cov(mz.ctd2,use='pairwise'),3)) (CVMat.dz2 <- round(cov(dz.ctd2,use='pairwise'),3)) (CVmz2 <- CVMat.mz2[1,2]) (CVdz2 <- CVMat.dz2[1,2]) (Vp2 <- (sum(diag(CVMat.mz2)) + sum(diag(CVMat.dz2)))/4) #ctd1.data CTD ACE Model Method of Moments (MoM) estimates: (MoM.VA.hat2 <- 2*(CVmz2 - CVdz2) ) (MoM.VC.hat2 <- 2*CVdz2 - CVmz2 ) (MoM.VE.hat2 <- Vp2 - CVmz2) (MoM.h2.hat2 <- MoM.VA.hat2/Vp2) # ----------------------------------------------------------------------- #Set up the ACE CTD Model for Dataset 2 # ----------------------------------------------------------------------- #NOTE: We can save a lot of coding by just reusing most of the objects we already created up in the CTD model in Section A. If you have NOT yet run Section A, or have somehow over-written what you did in Section A, please rerun Section A. #Put in the NEW data (dataset 2) dataMZ2 <- mxData(observed=mz.ctd2, type="raw") dataDZ2 <- mxData(observed=dz.ctd2, type="raw") #Combine groups modelMZ2 <- mxModel("MZctd2",paramsA,dataMZ2,objMZ) modelDZ2 <- mxModel("DZctd2",paramsA,dataDZ2,objDZ) objF <- mxFitFunctionMultigroup(c("MZctd2","DZctd2")) ACE.ctd2.Model <- mxModel("ctd2ACE",modelMZ2,modelDZ2,objF) # ----------------------------------------------------------------------- #Run the model and get estimates # ----------------------------------------------------------------------- #Run the model ACE.ctd2.Fit <- mxRun(ACE.ctd2.Model,intervals=FALSE) (ACE.ctd2.Fit.Summary <- summary(ACE.ctd2.Fit)) #Is our model identified according to openMx? mxCheckIdentification(ACE.ctd2.Fit) #-2ll - ask your neighbors/breakout colleagues what theirs was ACE.ctd2.Fit.Summary$Minus2LogLikelihood #Estimates - ask your neighbors/breakout colleagues what theirs were mxEval(a%*%t(a), ACE.ctd2.Fit$MZctd2) #VA mxEval(d%*%t(d), ACE.ctd2.Fit$MZctd2) #VD (fixed) mxEval(c%*%t(c), ACE.ctd2.Fit$MZctd2) #VC #NOTE: How did your Maximum Likelihood estimates from openMx compare to the estimates you derived algebraically? Do you have any guesses about what the true variance components are in the population? # ----------------------------------------------------------------------- # ----------------------------------------------------------------------- # END Section F: Fit ACE CTD Model on Dataset 2 (including only the twins) # ----------------------------------------------------------------------- ####################################################### ####################################################### # ----------------------------------------------------------------------- #START Section G: Fit ADSE + AM Nuclear Twin Family Design (NTFD) Model on Dataset 2 (including both parents) # ----------------------------------------------------------------------- #NOTE: Here, we'll run another NTFD but on dataset 2 #Get the data we will be using and set it up # ----------------------------------------------------------------------- #Here, we will use the SAME data as in Section F, but we will now include both parents per family mz.ntf2 <- ntf2.data[ntf2.data$twin.type=='MZ',3:6] #grabs rows where twin.type=="MZ" and columns 3 to 6 dz.ntf2 <- ntf2.data[ntf2.data$twin.type=='DZ',3:6] #grabs rows where twin.type=="DZ" and columns 3 to 6 #Summaries of data summary(mz.ntf2) summary(dz.ntf2) #Look at covariances (mz.fam.cv2 <- round(cov(mz.ntf2,use='pairwise'),3)) (dz.fam.cv2 <- round(cov(dz.ntf2,use='pairwise'),3)) # ----------------------------------------------------------------------- #Set up the ADSE NTFD Model for Dataset 2 # ----------------------------------------------------------------------- #NOTE: We can save a lot of coding by just reusing most of the objects we already created up in the NTFD model already done in Section E. If you have NOT yet run Section E, or have somehow over-written what you did in Section E, please rerun Section E. #Put in the NEW data (dataset 2) dataMZ.ntf2 <- mxData(observed=mz.ntf2, type="raw") dataDZ.ntf2 <- mxData(observed=dz.ntf2, type="raw") #Combine groups modelMZ.ntf2 <- mxModel("MZntf2",paramsE,dataMZ.ntf2,objMZRels,qCon) modelDZ.ntf2 <- mxModel("DZntf2",paramsE,dataDZ.ntf2,objDZRels) objG <- mxFitFunctionMultigroup(c("MZntf2","DZntf2")) ADSE.ntf2.Model <- mxModel("ntf2ADSE",modelMZ.ntf2,modelDZ.ntf2,objG) # ----------------------------------------------------------------------- #Run the model and get estimates # ----------------------------------------------------------------------- #Run the model ADSE.ntf2.Fit <- mxRun(ADSE.ntf2.Model,intervals=FALSE) (ADSE.ntf2.Fit.Summary <- summary(ADSE.ntf2.Fit)) #-2ll - ask your neighbors/breakout colleagues what theirs was ADSE.ntf2.Fit.Summary$Minus2LogLikelihood #Estimates - ask your neighbors/breakout colleagues what theirs were mxEval(a%*%q2%*%t(a), ADSE.ntf2.Fit$MZntf2) #VA mxEval(d%*%t(d), ADSE.ntf2.Fit$MZntf2) #VD mxEval(s%*%t(s), ADSE.ntf2.Fit$MZntf2) #VS mxEval(e%*%t(e), ADSE.ntf2.Fit$MZntf2) #VE ADSE.ntf2.Fit$MZntf2$CvSps$result #CV.Sps ADSE.ntf2.Fit$MZntf2$VP$result #VP #NOTE: How do your estimates compare to those from the usual CTD estimates in Section F? Which do you believe more? Why? What source of bias did the NTFD take care of? What other issues might cause bias in these estimates? # ----------------------------------------------------------------------- # ----------------------------------------------------------------------- # END Section G: Fit ADSE + AM NTFD Model on Dataset 2 (including both parents) # ----------------------------------------------------------------------- ####################################################### ####################################################### # ----------------------------------------------------------------------- #START Section H: Total Results # ----------------------------------------------------------------------- #NOTE: This section will put all your results together in an easy to read matrix. Each column will be estimates from various models you ran and the rows are VA.est, VD.est and (VC.est or VS.est). The final column reveals the true parameters in the population, which we know because this was simulated data. #Create two matrices with all results in them RES.DATA1 <- data.frame( ACE.D0.CTD=c(mxEval(a%*%t(a), ACE.ctd1.Fit$MZctd1),mxEval(d%*%t(d), ACE.ctd1.Fit$MZctd1),mxEval(c%*%t(c), ACE.ctd1.Fit$MZctd1),NA), ACE.D05.CTD=c(mxEval(a%*%t(a), ACE.ctd1.d05.Fit$MZctd1),mxEval(d%*%t(d), ACE.ctd1.d05.Fit$MZctd1),mxEval(c%*%t(c), ACE.ctd1.d05.Fit$MZctd1),NA), ACE.D12.CTD=c(mxEval(a%*%t(a), ACE.ctd1.d12.Fit$MZctd1),mxEval(d%*%t(d), ACE.ctd1.d12.Fit$MZctd1),mxEval(c%*%t(c), ACE.ctd1.d12.Fit$MZctd1),NA), ACSE.CTD=c(mxEval(a%*%t(a), ADCE.ctd1.Fit$MZctd1),mxEval(d%*%t(d), ADCE.ctd1.Fit$MZctd1),mxEval(c%*%t(c), ADCE.ctd1.Fit$MZctd1),NA), ADSE.NTFD=c(mxEval(a%*%q2%*%t(a), ADSE.ntf1.Fit$MZntf1),mxEval(d%*%t(d), ADSE.ntf1.Fit$MZntf1),mxEval(s%*%t(s), ADSE.ntf1.Fit$MZntf1),ADSE.ntf1.Fit$MZntf1$CvSps$result), TRUTH=c(.41,.12,.06,0), row.names=c("VA.est","VD.est","VC or VS.est","CV.Spouse")) RES.DATA1 <- round(as.matrix(RES.DATA1),3) RES.DATA2 <- data.frame( ACE.D0.CTD=c(mxEval(a%*%t(a), ACE.ctd2.Fit$MZctd2), mxEval(d%*%t(d), ACE.ctd2.Fit$MZctd2), mxEval(c%*%t(c), ACE.ctd2.Fit$MZctd2),NA), ADSE.NTFD=c(mxEval(a%*%q2%*%t(a), ADSE.ntf2.Fit$MZntf2), mxEval(d%*%t(d), ADSE.ntf2.Fit$MZntf2), mxEval(s%*%t(s), ADSE.ntf2.Fit$MZntf2),ADSE.ntf2.Fit$MZntf2$CvSps$result), TRUTH=c(.60,.04,.06,.56), row.names=c("VA.est","VD.est","VC or VS.est","CV.Spouse")) RES.DATA2 <- round(as.matrix(RES.DATA2),3) #NOTE: Look at all the results together. Make sure that you understand why the estimates behave as they do given the simulated parameters RES.DATA1 RES.DATA2 # ----------------------------------------------------------------------- #END Section H: Total Results # ----------------------------------------------------------------------- ####################################################### ####################################################### # ----------------------------------------------------------------------- # START Section I: Fit ADFE + AM Nuclear Twin Family Design (NTFD) Model on Dataset 1 (including both parents) # ----------------------------------------------------------------------- #NOTE: We are now going to write an NTFD that estimates VF instead of VS. This makes the model a bit more complicated. We'll also use simulated data that includes a VF component. #Get the data (dataset 3) we will be using and set it up # ----------------------------------------------------------------------- #Download the simulated NTFD dataset 3 from the web con <- "http://www.matthewckeller.com/public/ntf3.data" ntf3.data <- read.table(con,header=TRUE,na.strings=-999) #note that -999 means "NA" in this dataset; na.strings=-999 has R change those to NAs nv <- 1 #Number of variables; this allows the same script to be used for multivariate models #Look at Data info(ntf3.data) #won't run if you haven't installed my user-defined functions above head(ntf3.data) str(ntf3.data) summary(ntf3.data) #Seperate out the MZ and DZ relative types mz.ntf3 <- ntf3.data[ntf3.data$twin.type=='MZ',3:6] #grabs rows where twin.type=="MZ" and columns 3 to 6 dz.ntf3 <- ntf3.data[ntf3.data$twin.type=='DZ',3:6] #grabs rows where twin.type=="DZ" and columns 3 to 6 #Summaries of data summary(mz.ntf3) summary(dz.ntf3) #Look at covariances (mz.fam.cv <- round(cov(mz.ntf3,use='pairwise'),3)) (dz.fam.cv <- round(cov(dz.ntf3,use='pairwise'),3)) # ----------------------------------------------------------------------- #Let's now fit an ADFE model # ----------------------------------------------------------------------- (VA.st.I <- .7) (VA.est.I <- TRUE) #TRUE" means we allow VA to be estimated freely (VD.st.I <- .2) (VD.est.I <- TRUE) #TRUE" means we allow VD to be estimated freely (VS.st.I <- 0) (VS.est.I <- FALSE) #FALSE" means we fix VS to be its start value (VF.st.I <- .1) (VF.est.I <- TRUE) #TRUE" means we allow VF to be estimated freely # ----------------------------------------------------------------------- #Set up the ADFE NTFD # ----------------------------------------------------------------------- # NOTE: we use the path coefficient model approach for ETFDs. We can't (at least I don't know how to) use the direct symmetric approach, where we directly estimate the variances, when there are non-linear constraints a <- mxMatrix(type="Lower",nrow=nv,ncol=nv,free=VA.est.I,values=VA.st.I,lbound=1e-4,label="AddGenPath",name="a") d <- mxMatrix(type="Lower",nrow=nv,ncol=nv,free=VD.est.I,values=VD.st.I,lbound=1e-4,label="DomPath",name="d") s <- mxMatrix(type="Lower",nrow=nv, ncol=nv,free=VS.est.I,values=VS.st.I,lbound=1e-4,label="SibPath", name="s") m <- mxMatrix(type="Lower",nrow=nv, ncol=nv,free=VF.est.I,values=VF.st.I,lbound=1e-4,label="VTPath", name="m") e <- mxMatrix(type="Lower",nrow=nv,ncol=nv,free=T,values=1.5,lbound=1e-4,label="EnvPath",name="e") mu <- mxMatrix(type="Lower",nrow=nv,ncol=nv,free=T,values=0,lbound=-.1,ubound=.7,label="AMCopath",name="mu") f <- mxMatrix(type="Lower",nrow=nv,ncol=nv,free=F,values=1,label="fPath",name="f") #Defining & equating nonlinear constraint for q (the variance of latent variable A) and x (the variance of latent variable F). q1 <- mxMatrix(type="Full",nrow=nv,ncol=nv,free=T,values=1,label="LatentVarAddGen",name="q1") q2 <- mxAlgebra(1 + delta %&% mu,name="q2") qCon <- mxConstraint(q1==q2,name='qCon') ########HERE IS THE FIRST SECTION YOU NEED TO COMPLETE - FILL IN THE ??? BY DERIVING THE EXPECTATION OF X x1 <- mxMatrix(type="Full",nrow=nv,ncol=nv,free=T,values=.3,label="LatentVarF",name="x1") x2 <- mxAlgebra(???,name="x2") xCon <- mxConstraint(x1==x2,name='xCon') #NOTE: Let's talk about how to derive the expectation of the variance of the F factor (x). An assumption of the NTFD (required for identification but also plausible) is that the variance components in the parental generation are the same as those in the offspring generation (i.e., that the variances have reached an equilibrium). Thus, to find x, we set it to be equal to all the chains that can be traced from and back to the same F latent variable in offspring. Start at, say, the F of twin 1, and find all legitimate chains that start there and end back there. ########HERE IS THE SECOND AND FINAL SECTION YOU NEED TO COMPLETE - FILL IN THE ??? BY DERIVING THE EXPECTATION OF W w1 <- mxMatrix(type="Full",nrow=nv,ncol=nv,free=T,values=.2,label="CovAF",name="w1") w2 <- mxAlgebra(???, name="w2") wCon <- mxConstraint(w1==w2,name='wCon') #NOTE: Let's talk about how to derive the expectation of the covariance between A and F (w). A covariance will develop between A and F anytime both are non-zero. This is because vertical transmission passes all the constituents of the parental phenotype, including A effects, to the offspring, inducing a covariance between A and F that is accentuated in the presence of assortative mating. By tracing all the pathways from A in twin 1 to F in twin 1, or A in twin 1 to F in twin 2 (surprisingly, they are equivalent), you should be able to find the nonlinear constraint for the covariance between A and F #mxAlgebra - Expected Variances and Relative Covariances delta <- mxAlgebra(q1 %*% a + w1 %*% f,name="delta") VP <- mxAlgebra(a %&% q1 + e %*% t(e) + d %*% t(d) + s %*% t(s) + f %&% x1 + 2 %x% a %*% w1 %*% f,name="VP") CvMz <- mxAlgebra(a %&% q1 + d %*% t(d) + s %*% t(s) + f %&% x1 + 2 %x% a %*% w1 %*% f,name="CvMz") #MZ CvDz <- mxAlgebra(a %&% (q1-.5) + .25 %x% d %*% t(d) + s %*% t(s) + f %&% x1 + 2 %x% a %*% w1 %*% f,name="CvDz") #DZ CvPO <- mxAlgebra(.5 %x% a %*% delta + .5 %x% a %*% delta %*% mu %*% VP + m %*% VP + m %*% VP %*% mu %*% VP,name="CvPO") #Parent-Offspring CvSps <- mxAlgebra(VP %&% mu,name="CvSps") #Spouse #Put these relative covariances together into MZ relatives and DZ relatives matrices MZ.rel.cv <- mxAlgebra(rbind( cbind(VP, CvMz, CvPO, CvPO), cbind(CvMz, VP, CvPO, CvPO), cbind(CvPO, CvPO, VP, CvSps), cbind(CvPO, CvPO, CvSps, VP)), dimnames=list(colnames(mz.ntf3),colnames(mz.ntf3)),name="expCovMZRels") DZ.rel.cv <- mxAlgebra(rbind( cbind(VP, CvDz, CvPO, CvPO), cbind(CvDz, VP, CvPO, CvPO), cbind(CvPO, CvPO, VP, CvSps), cbind(CvPO, CvPO, CvSps, VP)), dimnames=list(colnames(dz.ntf3),colnames(dz.ntf3)),name="expCovDZRels") #Put the relatives means together into means matrix (same for MZ and DZ rels) Both.means <- mxMatrix(type="Full", nrow=1, ncol=4*nv, free=TRUE, values= .01, label="mean",dimnames=list(NULL, colnames(mz.ntf3)), name="expMeanBoth") #Put in the data dataMZRel3 <- mxData(observed=mz.ntf3, type="raw") dataDZRel3 <- mxData(observed=dz.ntf3, type="raw") #Objectives for two groups objMZRels <- mxExpectationNormal(covariance="expCovMZRels",means="expMeanBoth", dimnames=c('tw1','tw2','father','mother')) objDZRels <- mxExpectationNormal(covariance="expCovDZRels",means="expMeanBoth", dimnames=c('tw1','tw2','father','mother')) #Combine groups myfitfun <- mxFitFunctionML() paramsI <- list(a,d,e,mu,s,m,f,VP,q1,q2,qCon,x1,x2,xCon,w1,w2,wCon,delta,CvMz,CvDz,CvPO,CvSps,Both.means,MZ.rel.cv,DZ.rel.cv,myfitfun) modelMZ.ntf3 <- mxModel("MZntf3",paramsI,dataMZRel3,objMZRels) modelDZ.ntf3 <- mxModel("DZntf3",paramsI,dataDZRel3,objDZRels) objI <- mxFitFunctionMultigroup(c("MZntf3","DZntf3")) ADFE.ntf3.Model <- mxModel("ntf3ADSE",modelMZ.ntf3,modelDZ.ntf3,objI) # ----------------------------------------------------------------------- #Run the model and get estimates # ----------------------------------------------------------------------- #Run the model ADFE.ntf3.Fit <- mxRun(ADFE.ntf3.Model,intervals=FALSE) (ADFE.ntf3.Fit.Summary <- summary(ADFE.ntf3.Fit)) #-2ll - ask your neighbors/breakout colleagues what theirs was ADFE.ntf3.Fit.Summary$Minus2LogLikelihood #Estimates & start values- as your neighbors/breakout colleagues what theirs were mxEval(a%*%q2%*%t(a), ADFE.ntf3.Fit$MZntf3) #VA (TRUE value = .49) mxEval(d%*%t(d), ADFE.ntf3.Fit$MZntf3) #VD (TRUE value = .10) mxEval(s%*%t(s), ADFE.ntf3.Fit$MZntf3) #VS (fixed @ 0) (TRUE value = 0) mxEval(x2, ADFE.ntf3.Fit$MZntf3) #x (TRUE value = .21) mxEval(w2, ADFE.ntf3.Fit$MZntf3) #w (TRUE value = .32) mxEval(q2, ADFE.ntf3.Fit$MZntf3) #q (TRUE value = 1.23) mxEval(e%*%t(e), ADFE.ntf3.Fit$MZntf3) #VE (TRUE value = .4) ADFE.ntf3.Fit$MZntf3$CvSps$result #CV.Sps (TRUE value = .47) ADFE.ntf3.Fit$MZntf3$VP$result #VP (TRUE value = 1.58) mxEval(a,ADFE.ntf3.Fit$MZntf3) #a (TRUE value = .63) # ----------------------------------------------------------------------- #NOTE: The PDF GeneEvolve.Dataset3.pdf provides information about the simulation. In models with AM and VT, the initial starting points of VA, VF, etc. supplied by the user will not be the final (equilibrium) parameters, and those are what is being estimated in an NTFD. To see those in a simulation, you need to directly measure them in the final generation of a simulation that has run for 10+ generations. To see those in the pdf, see the "Data" row of the "Unstandardized Variance Components" on p. 3. #NOTE: However, beware that COV(A,F) in that pdf refers to half of the amount that A-F covariance contributes to the phenotypic variance. I.e., the total amount that A-F covariance contributes to VP = 2awf, so what's printed in the pdf is true population value of awf. Given f=1, you can back calculate to get what w should be in our model; that's what I've written above as "w". #NOTE: unless your sample size is infinite, you don't expect your estimates from any given run of openMx to equal the parameter values exactly - even if unbiased, they will differ slightly from the expectations due to sampling error. Right now, we're looking to see if our estimates are in the right ballpark (i.e., the true parameters are within +/- 2 SEs of the estimates). If we wanted to check this formally, we'd run the simulation many times (e.g., 1000) in a "Monte Carlo" study and look at the average estimate across all runs. These averages should be VERY close to the population parameters. # ----------------------------------------------------------------------- # END Section I: Fit ADFE + AM NTFD Model on Dataset 3 (including both parents) # ----------------------------------------------------------------------- ######################################################