#THIS DOCUMENT CONTAINS A WORKING NTFD MODEL FITTING VF ON SIMULATED DATA.

# Matt Keller
# Workshop 2024


#######################################################
# -----------------------------------------------------------------------
# START: Fit ADFE + AM Nuclear Twin Family Design (NTFD) Model on Dataset 3 (including both parents)
# -----------------------------------------------------------------------

#NOTE: We are now going to write an NTFD that estimates VF instead of VS. This makes the model a bit more complicated. We'll also use simulated data that includes a VF component.


#Get the data (dataset 3) we will be using and set it up
# -----------------------------------------------------------------------

#Download the simulated NTFD dataset 3 from the web
con <- "http://www.matthewckeller.com/public/ntf3.data"
ntf3.data <- read.table(con,header=TRUE,na.strings=-999) #note that -999 means "NA" in this dataset; na.strings=-999 has R change those to NAs
nv <- 1 #Number of variables; this allows the same script to be used for multivariate models

#Look at Data
dim(ntf3.data) 
head(ntf3.data)
str(ntf3.data)
summary(ntf3.data)  

#Seperate out the MZ and DZ relative types
mz.ntf3 <- ntf3.data[ntf3.data$twin.type=='MZ',3:6] #grabs rows where twin.type=="MZ" and columns 3 to 6
dz.ntf3 <- ntf3.data[ntf3.data$twin.type=='DZ',3:6] #grabs rows where twin.type=="DZ" and columns 3 to 6

#Summaries of data
summary(mz.ntf3)
summary(dz.ntf3)

#Look at covariances
(mz.fam.cv <- round(cov(mz.ntf3,use='pairwise'),3))
(dz.fam.cv <- round(cov(dz.ntf3,use='pairwise'),3))
# -----------------------------------------------------------------------


#Let's now fit an ADFE model
# -----------------------------------------------------------------------
(VA.st <- .7)
(VA.est <- TRUE) #TRUE" means we allow VA to be estimated freely
(VD.st <- .2)
(VD.est <- TRUE) #TRUE" means we allow VD to be estimated freely
(VS.st <- 0)
(VS.est <- FALSE) #FALSE" means we fix VS to be its start value
(VF.st <- .1)
(VF.est <- TRUE) #TRUE" means we allow VF to be estimated freely
# -----------------------------------------------------------------------


#Set up the ADFE NTFD
# -----------------------------------------------------------------------
# NOTE: we use the path coefficient model approach for ETFDs. We can't (at least I don't know how to) use the direct symmetric approach, where we directly estimate the variances, when there are non-linear constraints 
a <- mxMatrix(type="Lower",nrow=nv,ncol=nv,free=VA.est,values=VA.st,lbound=1e-4,label="AddGenPath",name="a")
d <- mxMatrix(type="Lower",nrow=nv,ncol=nv,free=VD.est,values=VD.st,lbound=1e-4,label="DomPath",name="d")
s <- mxMatrix(type="Lower",nrow=nv, ncol=nv,free=VS.est,values=VS.st,lbound=1e-4,label="SibPath", name="s")
m <- mxMatrix(type="Lower",nrow=nv, ncol=nv,free=VF.est,values=VF.st,lbound=1e-4,label="VTPath", name="m")
e <- mxMatrix(type="Lower",nrow=nv,ncol=nv,free=T,values=1.5,lbound=1e-4,label="EnvPath",name="e")
mu <- mxMatrix(type="Lower",nrow=nv,ncol=nv,free=T,values=0,lbound=-.1,ubound=.7,label="AMCopath",name="mu")
f <- mxMatrix(type="Lower",nrow=nv,ncol=nv,free=F,values=1,label="fPath",name="f")


#Defining & equating nonlinear constraint for q (the variance of latent variable A) and x (the variance of latent variable F).
q1 <- mxMatrix(type="Full",nrow=nv,ncol=nv,free=T,values=1,label="LatentVarAddGen",name="q1")
q2 <- mxAlgebra(1 + delta %&% mu,name="q2")
qCon <- mxConstraint(q1==q2,name='qCon')   

x1 <- mxMatrix(type="Full",nrow=nv,ncol=nv,free=T,values=.3,label="LatentVarF",name="x1")
x2 <- mxAlgebra(2 %x% m %*% VP %*% m + 2 %x% m %*% VP %*% mu %*% VP %*% m,name="x2")
xCon <- mxConstraint(x1==x2,name='xCon') 

w1 <- mxMatrix(type="Full",nrow=nv,ncol=nv,free=T,values=.2,label="CovAF",name="w1")
w2 <- mxAlgebra(m %*%  delta + m %*% VP %*% mu %*% delta, name="w2")
wCon <- mxConstraint(w1==w2,name='wCon') 


#mxAlgebra - Expected Variances and Relative Covariances
delta <- mxAlgebra(q1 %*% a + w1 %*% f,name="delta")
VP <- mxAlgebra(a %&% q1 + e %*% t(e) + d %*% t(d) + s %*% t(s) + f %&% x1 + 2 %x% a %*% w1 %*% f,name="VP")
CvMz <- mxAlgebra(a %&% q1 + d %*% t(d) + s %*% t(s) + f %&% x1 + 2 %x% a %*% w1 %*% f,name="CvMz") #MZ
CvDz <- mxAlgebra(a %&% (q1-.5) + .25 %x% d %*% t(d) + s %*% t(s) + f %&% x1 + 2 %x% a %*% w1 %*% f,name="CvDz") #DZ
CvPO <- mxAlgebra(.5 %x% a %*% delta + .5 %x% a %*% delta %*% mu %*% VP + m %*% VP + m %*% VP %*% mu %*% VP,name="CvPO") #Parent-Offspring
CvSps <- mxAlgebra(VP %&% mu,name="CvSps") #Spouse

#Put these relative covariances together into MZ relatives and DZ relatives matrices
MZ.rel.cv <-    mxAlgebra(rbind(
  cbind(VP,   CvMz, CvPO,  CvPO),
  cbind(CvMz, VP,   CvPO,  CvPO),
  cbind(CvPO, CvPO, VP,    CvSps),
  cbind(CvPO, CvPO, CvSps, VP)),
  dimnames=list(colnames(mz.ntf3),colnames(mz.ntf3)),name="expCovMZRels")

DZ.rel.cv <-    mxAlgebra(rbind(
  cbind(VP,   CvDz, CvPO,  CvPO),
  cbind(CvDz, VP,   CvPO,  CvPO),
  cbind(CvPO, CvPO, VP,    CvSps),
  cbind(CvPO, CvPO, CvSps, VP)),
  dimnames=list(colnames(dz.ntf3),colnames(dz.ntf3)),name="expCovDZRels")

#Put the relatives means together into means matrix (same for MZ and DZ rels)
Both.means <-   mxMatrix(type="Full", nrow=1, ncol=4*nv, free=TRUE,  values= .01, label="mean",dimnames=list(NULL, colnames(mz.ntf3)), name="expMeanBoth")

#Put in the data
dataMZRel3 <- mxData(observed=mz.ntf3, type="raw")
dataDZRel3 <- mxData(observed=dz.ntf3, type="raw")

#Objectives for two groups
objMZRels <- mxExpectationNormal(covariance="expCovMZRels",means="expMeanBoth", dimnames=c('tw1','tw2','father','mother'))
objDZRels <- mxExpectationNormal(covariance="expCovDZRels",means="expMeanBoth", dimnames=c('tw1','tw2','father','mother'))

#Combine groups
myfitfun <- mxFitFunctionML()
paramsI <- list(a,d,e,mu,s,m,f,VP,q1,q2,qCon,x1,x2,xCon,w1,w2,wCon,delta,CvMz,CvDz,CvPO,CvSps,Both.means,MZ.rel.cv,DZ.rel.cv,myfitfun)
modelMZ.ntf3 <- mxModel("MZntf3",paramsI,dataMZRel3,objMZRels)
modelDZ.ntf3 <- mxModel("DZntf3",paramsI,dataDZRel3,objDZRels)
objI <- mxFitFunctionMultigroup(c("MZntf3","DZntf3"))
ADFE.ntf3.Model <- mxModel("ntf3ADSE",modelMZ.ntf3,modelDZ.ntf3,objI)
# -----------------------------------------------------------------------


#Run the model and get estimates
# -----------------------------------------------------------------------
#Run the model
ADFE.ntf3.Fit <- mxRun(ADFE.ntf3.Model,intervals=FALSE)
(ADFE.ntf3.Fit.Summary <- summary(ADFE.ntf3.Fit))

#-2ll
ADFE.ntf3.Fit.Summary$Minus2LogLikelihood

#Estimates & start values- as your neighbors/breakout colleagues what theirs were
mxEval(a%*%q2%*%t(a), ADFE.ntf3.Fit$MZntf3) #VA (TRUE value = .49)
mxEval(d%*%t(d), ADFE.ntf3.Fit$MZntf3)      #VD (TRUE value = .10)
mxEval(s%*%t(s), ADFE.ntf3.Fit$MZntf3)      #VS (fixed @ 0) (TRUE value = 0)
mxEval(x2, ADFE.ntf3.Fit$MZntf3)            #x  (TRUE value = .21)
mxEval(w2, ADFE.ntf3.Fit$MZntf3)            #w  (TRUE value = .32)
mxEval(q2, ADFE.ntf3.Fit$MZntf3)            #q  (TRUE value = 1.23)
mxEval(e%*%t(e), ADFE.ntf3.Fit$MZntf3)      #VE (TRUE value = .4)
ADFE.ntf3.Fit$MZntf3$CvSps$result           #CV.Sps (TRUE value = .47)
ADFE.ntf3.Fit$MZntf3$VP$result              #VP (TRUE value = 1.58)
mxEval(a,ADFE.ntf3.Fit$MZntf3)              #a  (TRUE value = .63)
# -----------------------------------------------------------------------

#NOTE: The PDF GeneEvolve.Dataset3.pdf provides information about the simulation. In models with AM and VT, the initial starting points of VA, VF, etc. supplied by the user will not be the final (equilibrium) parameters, and those are what is being estimated in an NTFD. To see those in a simulation, you need to directly measure them in the final generation of a simulation that has run for 10+ generations. To see those in the pdf, see the "Data" row of the "Unstandardized Variance Components" on p. 3. 
#NOTE: However, beware that COV(A,F) in that pdf refers to half of the amount that A-F covariance contributes to the phenotypic variance. I.e., the total amount that A-F covariance contributes to VP = 2awf, so what's printed in the pdf is true population value of awf. Given f=1, you can back calculate to get what w should be in our model; that's what I've written above as "w"


# -----------------------------------------------------------------------
# END: Fit ADFE + AM NTFD Model on Dataset 3 (including both parents)
# -----------------------------------------------------------------------
######################################################