# Multiple-indicator DOC Model
# MadhurBain Singh, Nathan Gillespie, Hermine H. Maes
# March 8, 2024

## Qualtrics:
# https://qimr.az1.qualtrics.com/jfe/form/SV_eW1CeUT3ZibFMzQ

library(OpenMx)
# mxOption(NULL, 'Number of Threads', parallel::detectCores())
mxOption(NULL, "Default optimizer", "CSOLNP")
mxVersion()


# Select Data ==============================================================
dataFile <- "~/doc/Data/ACE_DoC_model_sim_data.tsv"
docData  <- read.table(dataFile, header = T)

## Look at the data
head(docData)  ## The first few rows
dim(docData)   ## Dimensions: nr of rows and nr of columns

## Check the zygosity distribution
table(docData$zyg)

## Note the variable names (for one twin) from the data
## Observed/manifest indicator variables in the dataset
vars <- c(
  ## 3 Indicators of Phenotype X
  "X1","X2", "X3", 
  ## 3 Indicators of Phenotype Y
  "Y1", "Y2", "Y3"
)

nv   <- length(vars)	# Number of manifest/observed variables per twin

## Duplicate the variable names for both twins
## Add appropriate suffix to the variable names to indicate twin 1 and twin 2
## Order of variables: all vars for T1, then all vars for T2 in the same order
selVars <- paste(vars, 
                 c(rep("T1",nv),rep("T2",nv)), 
                 sep = "_")
selVars

## Latent common factors, i.e., the true score (one for each phenotype)
latFacs <- c("Fx","Fy")
nFac		<- length(latFacs)	# Number of latent factor true scores or common pathways 		

## Subset Data by zygosity
## Keep the selected variables
mzdata		<- subset(docData, 
                  subset = docData$zyg == "MZ", 
                  select = selVars)
dzdata		<- subset(docData, 
                  subset = docData$zyg == "DZ", 
                  select = selVars)
str(mzdata)
str(dzdata)


# Descriptive Statistics ======================================================

psych::describe(mzdata)
psych::describe(dzdata)

# Print Means
round(colMeans(mzdata,na.rm=TRUE),4)
round(colMeans(dzdata,na.rm=TRUE),4)

# Cross-twin within-trait correlations
twin_pair_corrs <- cbind(
  diag2vec(round(cor(mzdata[selVars],use="complete"),4)[7:12,1:6]),
  diag2vec(round(cor(dzdata[selVars],use="complete"),4)[7:12,1:6])
)
colnames(twin_pair_corrs)	<- c("MZ","DZ")
rownames(twin_pair_corrs)	<- vars 
twin_pair_corrs
#        MZ     DZ
# X1 0.4786 0.2514
# X2 0.4695 0.2420
# X3 0.4771 0.2533
# Y1 0.5441 0.4052
# Y2 0.5336 0.3953
# Y3 0.5197 0.4008


# Matrices in the full D0C model =================================

# Vf = common factor covariance matrix (nFac*nFac) (3 componenets: Af, Cf, Ef)
# Β  = causal parameters (nFac*nFac)
# I  = Identity matrix   (nFac*nfac)
# Fl = factor loadings   (nv*nFac)
# Vs = Manifest indicator-specific covariance (nv*nv) (3 componenets: As, Cs, Es)
#     (including measurement error)


# Build multiple-indicator ACE model for two latent phenotypes ===========

## Specifies three sources of covariance: covA, covC, covE
## i.e., all model-expected covariance is due to background covariance (confounding), genetic and environmental
## There is no causal path between the phenotypes
## The best-fitting covariance-only model is your "null" model for the DoC model.


## ACE Variance of the latent common factors ================================

# A, C & E variance components of the latent factors (= true phenotype scores)
# NB: direction of causation goes DOWN the column & OUT along the row
# Matrix dimension = nFac x nFac

aFree		<- c(T,T,T) 
cFree		<- c(T,T,T) 
eFree		<- c(T,T,T)  

aVals		<- c(0.4,0.2,0.4)        
cVals		<- c(0.2,0.1,0.2)        
eVals		<- c(0.4,0.2,0.4)        

Af 		<- mxMatrix(type = "Symm", nrow = nFac,ncol=nFac, labels=c("Af_x","covAf","Af_y"), 
                 free=aFree, values=aVals, dimnames = list(latFacs,latFacs),
                 name="Af")
Cf 		<- mxMatrix(type = "Symm", nrow = nFac,ncol=nFac, labels=c("Cf_x","covCf","Cf_y"), 
                 free=cFree, values=cVals, dimnames = list(latFacs,latFacs),
                 name="Cf") 
Ef 		<- mxMatrix(type = "Symm", nrow = nFac,ncol=nFac, labels=c("Ef_x","covEf","Ef_y"), 
                 free=eFree, values=eVals, dimnames = list(latFacs,latFacs),
                 name="Ef") 

## Examine the matrices
## Dimensions, parameter labels, free/fixed parameters, starting values
Af
Cf
Ef


## B - causal parameters between latent factors ================================

# NB: direction of causation goes DOWN the column & OUT the row
# Matrix dimension = nFac x nFac

## We will initially fit a diphenotype twin model (two latent phenotypes) without causal paths
## So, the regression/causal paths (in the B matrix) are all fixed at 0 (for now).

Bfree   	<- matrix(c(F,F,  
                     F, F),
                   byrow = T,
                   nrow = nFac,
                   ncol = nFac, 
                   dimnames = list(latFacs,latFacs))
Bvals   	<- matrix(c(0,0,
                     0, 0),
                   byrow = T,
                   nrow = nFac,
                   ncol = nFac, 
                   dimnames = list(latFacs,latFacs))  	
Blabs	 	<- matrix(c(NA,"bxy",
                    "byx",NA),
                  byrow = T,
                  nrow = nFac,
                  ncol = nFac, 
                  dimnames = list(latFacs,latFacs))							  		  	   
B    	  <- mxMatrix(type="Full", nrow=nFac, ncol=nFac, 
                    labels=Blabs, free=Bfree, values=Bvals, 
                    dimnames = list(latFacs,latFacs), name = "B" )
I       <- mxMatrix(type="Iden", nrow=nFac, ncol=nFac, 
                    dimnames = list(latFacs,latFacs), name = "I")

## Examine the matrices
## Dimensions, parameter labels, free/fixed parameters, starting values
B
I


## Factor Loadings ====================================================
# Factor loadings from the latent true scores to the observed (indicator) phenotypes
# Matrix dimension = nv x nFac

Fl_free   	<- matrix(c(  T,F,
                         T,F,
                         T,F,
                         F,T,
                         F,T,
                         F,T),
                     byrow = T,
                     nrow = nv,
                     ncol = nFac,
                     dimnames = list(vars,latFacs))						  

Fl_vals   	<- matrix(c(  0.8,0.0,
                         0.8,0.0,
                         0.8,0.0,
                         0.0,0.8,
                         0.0,0.8,
                         0.0,0.8),
                     byrow = T,
                     nrow = nv,
                     ncol = nFac,
                     dimnames = list(vars,latFacs))						

Fl_labs   	<- matrix(c( "f_x1Fx", NA,
                        "f_x2Fx", NA,
                        "f_x3Fx", NA,
                        NA,      "f_y1Fy",
                        NA,      "f_y2Fy",
                        NA,      "f_y3Fy"),
                     byrow = T,
                     nrow = nv,
                     ncol = nFac,
                     dimnames = list(vars,latFacs))						  						  
Fl   	<- mxMatrix(type="Full", nrow=nv, ncol=nFac, 
                  free = Fl_free, 
                  values = Fl_vals, labels=Fl_labs,
                  dimnames = list(vars,latFacs), 
                  name = "Fl")

## Examine the matrix
## Dimensions, parameter labels, free/fixed parameters, starting values
Fl


## Observed Indicator-Specific Variance =====================================

# Variance of the observed/manifest indicator variable not explained by the latent true score
# Matrix dimension = nv x nv

As_labs	<- c("As_X1","As_X2","As_X3","As_Y1","As_Y2","As_Y3")
Cs_labs	<- c("Cs_X1","Cs_X2","Cs_X3","Cs_Y1","Cs_Y2","Cs_Y3") 
Es_labs	<- c("Es_X1","Es_X2","Es_X3","Es_Y1","Es_Y2","Es_Y3") 

As 	<- mxMatrix(type = "Diag", nrow=nv, ncol=nv, 
                labels=As_labs, free=T, values=0.05,
                dimnames = list(vars,vars), name="As") 
Cs 	<- mxMatrix(type = "Diag", nrow=nv, ncol=nv, 
                labels=Cs_labs, free=T, values=0.05,
                dimnames = list(vars,vars), name="Cs") 
Es 	<- mxMatrix(type = "Diag", nrow=nv, ncol=nv, 
                labels=Es_labs, free=T, lbound=0, values=0.1,
                dimnames = list(vars,vars), name="Es") 

## Es captures the measurement error in the observed variables
## We put a lower bound of 0 on the estimates of Es
## as measurement error >= 0


## Note that these are diagonal matrices.
## i.e., it has the all off-diagonal elements set at 0
## i.e., we are assuming that there is no covariance between the observed indicators
## above and beyond the covariance due to the common factors (the latent phenotypes)
## This also means that we assume that the measurement error in one indicator
## is not correlated with the measurement error in the other indicators.

## Examine the matrices
## Dimensions, parameter labels, free/fixed parameters, starting values
As
Cs
Es


## Expected covariance ======================================

# Matrix dimension = nv x nv
# Three separate matrices = A, C, E
# Algebra = ( Fl * (I-Β)~ * Vf * (I-Β)~' * Fl' ) + Vs
# Fl = factor loadings
# I  = Identity matrix
# Β  = causal parameters
# Vf = common factor (co)variance matrix (= Af | Cf | Ef)
# Vs = Manifest indicator-specific (co)variance matrix (= As | Cs | Es)

A 			<- mxAlgebra( Fl %&% (solve(I-B) %&% Af) + As, 
                   dimnames = list(vars,vars), name = "A")
C 			<- mxAlgebra( Fl %&% (solve(I-B) %&% Cf) + Cs, 
                   dimnames = list(vars,vars), name = "C")
E 			<- mxAlgebra( Fl %&% (solve(I-B) %&% Ef) + Es, 
                   dimnames = list(vars,vars), name = "E") 

# covMZ = A+C
covMZ   <- mxAlgebra( expression= rbind( cbind(A+C+E, A+C), 
                                         cbind(A+C,   A+C+E)), 			
                      dimnames = list(selVars,selVars), name="expCovMZ" )										 

# covDZ = 0.5*A+C
covDZ   <- mxAlgebra( expression= rbind( cbind(A+C+E,     0.5%x%A+C), 
                                         cbind(0.5%x%A+C, A+C+E)), 
                      dimnames = list(selVars,selVars), name="expCovDZ" )


## Standardized variance Components ============================================

# Total phenotypic variance, V = A + C + E
V      	<- mxAlgebra( expression= A+C+E, name="V" )

# Standardized variance components of individual indicators = ( variance component / total variance )
rowVC   <- rep('VC', nv)
colVC   <- rep(c('A','C','E','SA','SC','SE'),each=nv)
VC      <- mxAlgebra( expression=cbind(A,C,E,A/V,C/V,E/V), dimnames=list(rowVC,colVC), name="VC")


# Constrain the total variance of each latent common factor (Vf) to be == 1
unitM		<- mxMatrix( type="Unit", nrow=nFac, ncol=1, name="unitM" )
Vf      <- mxAlgebra( expression = 
                        (solve(I-B) %&% Af) +
                        (solve(I-B) %&% Cf) +
                        (solve(I-B) %&% Ef)  , name="Vf" )
ConVar 	<- mxConstraint( expression = diag2vec(Vf)==unitM, name="ConVar" ) 

## NB! We need to either fix the latent factor variance
##     or one of its factor loadings at some value (usually fixed at 1).
## Here, we are doing the former (fixing the factor variance at 1). 


## Latent true score means =====================================

# Mean matrix = μ
# Matrix dimension = 1*nv

Mean 	  	<- mxMatrix( type="Full", nrow=1, ncol=nv, 
                      free=TRUE, labels=paste0("mu_",vars), 
                      values = c(0.02,0.02,0.02,0.25,0.25,0.25),
                      dimnames = list("mu",vars), name = "Mean" ) # μ
## Examine the matrix
## Dimensions, parameter labels, free/fixed parameters, starting values
Mean 

## Duplicate the means for the co-twin (i.e., the means are assumed to be equal across twin-order)
expMean  	<- mxAlgebra( expression= cbind(Mean,Mean), name="expMean" )


## Data and Expectation Objects for Multiple Groups ===========================

# Data Objects for Multiple Groups
dataMZ   	<- mxData( observed=mzdata, type="raw" )
dataDZ   	<- mxData( observed=dzdata, type="raw" )

# Expectation Objects for Multiple Groups
# Define how the model expectations are calculated 
# The mxExpectationNormal function uses the algebra defined by the 'covariance' and 'means' arguments to define 
# the expected covariance and means under the assumption of multivariate normality
objMZ    	<- mxExpectationNormal( covariance="expCovMZ", means="expMean", dimnames=selVars )
objDZ    	<- mxExpectationNormal( covariance="expCovDZ", means="expMean", dimnames=selVars )

# Function to compute -2*(log likelihood) of the data given the current values of the free parameters and the expectation function
fiML    	<- mxFitFunctionML()

## There are five possible sources of covariance between (latent true scores of) X and Y
## Three possible sources of background confounding (covA, covC, covE)
## and two possible causal effects (byx, bxy).
## Save the object to estimate CIs for all five.
## In our best-fitting model, We can estimate the CIs of the parameters
## that end being freely estimated in that model.

ci 	    	<- mxCI(c("Af_x","Cf_x","Ef_x","Af_y","Cf_y","Ef_y",
                   "bxy","byx","covAf","covCf","covEf"))


## Combine all objects into a model ===========================================

# Final steps
## Objects common to both submodels (MZ and DZ)
pars    	<- list( 
  ## Latent factor covariance matrices
  Af,Cf,Ef, 
  ## Causal paths
  I,B,
  ## Factor loadings
  Fl, 
  ## Observed-indicator-specific covariance matrices
  As,Cs,Es, 
  ## Total/combined covariance matrices
  A,C,E, 
  ## Objects for the means
  Mean,expMean, 
  ## Objects for standardizing the variance component estimates
  V,VC, 
  ## Constraint for the variance of the latent factors
  unitM,Vf,ConVar,  
  ## object for confidence intervals
  ci 
)  

## Add MZ-specific objects to the MZ submodel
modelMZ 	<- mxModel( pars, dataMZ, covMZ, objMZ, fiML, 
                     name="MZ" ) # Function to create MxModel objects 

## Add DZ-specific objects to the DZ submodel
modelDZ 	<- mxModel( pars, dataDZ, covDZ, objDZ, fiML, 
                     name="DZ" ) # Function to create MxModel objects 

## Combine the submodels into one model
obj   		<- mxFitFunctionMultigroup(c("MZ","DZ"))				    # Function used to fit a multiple group model
ACE_model	<- mxModel( "ACE", pars, modelMZ, modelDZ, obj )

## Final check of the starting values of all free parameters
omxGetParameters(ACE_model)


# TUTORIAL ####################################################################

## ACE Model with the Two Latent True Scores ======================

ACE_fit 	<- mxRun( ACE_model )

## Look at the standardized ACE estimates of all six observed variables
var_comp <- cbind(
  round(diag2vec(ACE_fit$algebras$VC$result[1:6,19:24]),2),
  round(diag2vec(ACE_fit$algebras$VC$result[1:6,25:30]),2),
  round(diag2vec(ACE_fit$algebras$VC$result[1:6,31:36]),2))
rownames(var_comp) <- vars 
colnames(var_comp) <- c("A","C","E") 	
var_comp


# 1. Find the best-fitting ACE model (covariance-only) ============================

## In this tutorial we will perform "global" tests of A and C variances
## i.e., test the joint statistical significance of A (or C, or A+C)
## for both latent true scores, plus all observed indicator variables.

### Global AE - correlated background A & E only, no causation =================

## To test the joint significance of all C components
## Drop the shared environmental (C) variance components of the latent true scores
## Drop the shared environmental (C) variance components of the observed indicators

AE 				<- mxModel( ACE_fit,name="AE")
AE 				<- omxSetParameters( AE, label=c("Cf_x","covCf","Cf_y"),free=F,values=0)   
AE 				<- omxSetParameters( AE, label=Cs_labs,free=F,values=0)  		
AE_fit  	<- mxRun( AE )
summary(AE_fit)

### Global CE - correlated background C & E only, no causation =================

## To test the joint significance of all A components
## Drop the additive genetic (A) variance components of the latent true scores
## Drop the additive genetic (A) variance components of the observed indicators

CE 				<- mxModel( ACE_fit,name="CE") 
CE 				<- omxSetParameters( CE, label=c("Af_x","covAf","Af_y"),free=F,values=0)   
CE 				<- omxSetParameters( CE, label=As_labs,free=F,values=0)  		 		
CE_fit	  <- mxRun( CE )
summary(CE_fit)

### Global E - correlated background E only, no causation ======================

## To test the joint significance of all A and all C components
## Drop the additive genetic (A) variance components of the latent true scores
## Drop the additive genetic (A) variance components of the observed indicators
## Drop the shared environmental (C) variance components of the latent true scores
## Drop the shared environmental (C) variance components of the observed indicators

E 					<- mxModel( AE_fit,name="E") 
E 	  			<- omxSetParameters( E, label=c("Af_x","covAf","Af_y"),free=F,values=0)   
E 				  <- omxSetParameters( E, label=As_labs,free=F,values=0)  		
E_fit	      <- mxRun( E )
summary(E_fit)

## Note the missing Std Errors of Ef_x and Ef_y.
## In this model, Ef_x = total variance of Fx
## As we have fixed the total variance of Fx at 1.
## Ef_x also, indirectly, becomes fixed at 1.
## Hence, Ef_x and Ef_y are no longer freely estimated, and so have no S.E.


##### Compare ACE with global AE, CE and E covariance models (no causation) ==========

subs 				<- c( AE_fit, CE_fit, E_fit )
comps				<- mxCompare( ACE_fit,subs )
comps

## These models assume no causation between X and Y.
## So, the best-fitting model is our "null" model for further causal tests.

## Specify the best-fitting model as "null" model
## For example, if ACE_fit is the best-fitting model,
## nullACE_fit <- ACE_fit

nullACE_fit <- ???

## Now, examine the ACE estimates of X and Y.
## Refit the model with intervals=TRUE
nullACE_fit <- mxRun(nullACE_fit, intervals = T)
round(nullACE_fit$output$confidenceIntervals,3)

## NB: If the two phenotypes have similar ACE estimates, 
## this pair of phenotypes may not be suitable for DoC modeling!


# 2. Test the DoC model (No background covariance/confounding) ================

## We'll now test the DoC model, i.e., the causal paths between Fx and Fy


### Causation from Fx to Fy. No background ACE covariance ===============
Fx_to_Fy 				<- mxModel( nullACE_fit, name="ACE_byx")
## Drop the background ACE covariances in the null model
Fx_to_Fy 				<- omxSetParameters( Fx_to_Fy, label=c("covAf","covCf","covEf"),free=FALSE,values=0)  
## Freely estimate the causal path from Fx to Fy (byx)
Fx_to_Fy 				<- omxSetParameters( Fx_to_Fy, label="byx",free=TRUE,values=0.2)  
Fx_to_Fy_fit  	<- mxRun( Fx_to_Fy )
summary( Fx_to_Fy_fit )


### Causation from Fy to Fx. No background ACE covariance ===============
Fy_to_Fx 				<- mxModel( nullACE_fit, name="ACE_bxy")
## Drop the background ACE covariances in the null model
Fy_to_Fx 				<- omxSetParameters( Fy_to_Fx, label=c("covAf","covCf","covEf"),free=FALSE,values=0)  
## Freely estimate the causal path from Fy to Fx (bxy)
Fy_to_Fx 				<- omxSetParameters( Fy_to_Fx, label="bxy",free=TRUE,values=0.2)  
Fy_to_Fx_fit  	<- mxTryHard( Fy_to_Fx )
summary( Fy_to_Fx_fit )


### Bidirectional Causation between Fy to Fx. No background ACE covariance ===============
recip_doc 					<- mxModel( nullACE_fit,name="ACE_recip")
## Drop the background ACE covariances in the null model
recip_doc 					<- omxSetParameters( recip_doc, label=c("covAf","covCf","covEf"),free=FALSE,values=0)  
## Estimate the causal path from Fx to Fy (byx)
recip_doc 					<- omxSetParameters( recip_doc, label=c("byx"),free=TRUE,values=0.2)
## as well as the causal path from Fy to Fx (bxy)
recip_doc 					<- omxSetParameters( recip_doc, label=c("bxy"),free=TRUE,values=0.1) 
recip_doc_fit	    	<- mxRun( recip_doc )
summary( recip_doc_fit	)


##### Compare the fit of the DoC models against the null ========================
subs 						<- c( Fx_to_Fy_fit, Fy_to_Fx_fit, recip_doc_fit )
comps						<- mxCompare( nullACE_fit, subs )
comps

## We may also compare the X_to_Y DOC model to the reciprocal DOC model
mxCompare( recip_doc_fit, Fx_to_Fy_fit )

## What are the model-implied sources of covariance between X and Y
## in the best-fitting model?
## **covariance may be due to causation or background confounding.

## Estimate the confidence intervals of these parameters in the best-fitting model.
## Fill in the fitted model object of the best-fitting model
## e.g., if Fx_to_Fy_fit is the best-fitting model,
## best_fit <- Fx_to_Fy_fit

best_fit <- ???
best_fit <- mxRun( best_fit, intervals = TRUE)
round(best_fit$output$confidenceIntervals, 3)


# BONUS: Test a hybrid model (causation + confounding) =============================

## In "bivariate" twin models, we can usually estimate up to **three** sources of covariance
## between the two phenotypes (in this example, the two latent true scores).
## For example, we may freely estimate background A covariance, plus reciprocal causation

## Try hybrid model covA, byx, bxy
ACE_hybrid 					<- mxModel( ACE_fit,name="Hybrid_covA_byx_bxy")
## Drop the background confounding due to C and E
ACE_hybrid 					<- omxSetParameters( ACE_hybrid, label=c("covCf","covEf"),free=FALSE,values=0)  
## Estimate the bidirectional causal paths between Fx and Fy
ACE_hybrid 					<- omxSetParameters( ACE_hybrid, label=c("byx","bxy"),free=TRUE,values=c(0.2,0.1))  
ACE_hybrid_fit	    <- mxRun( ACE_hybrid, intervals = T )
summary( ACE_hybrid_fit	)

round(ACE_hybrid_fit$output$confidenceIntervals, 3)

# Compare the model fit
# NB: the causation-only DOC model is a restricted, nested model of the hybrid model
mxCompare( ACE_hybrid_fit, Fx_to_Fy_fit ) 


# END. ===========