# Load the necessary packages
require(MASS)
require(OpenMx)
require(psych)

# load the functions to do the analyses
source("lgcFun.R")

# Set your working directory
setwd("You_Must_Change_This_Text")

# Read in the data
lgcData_1 <- as.data.frame(read.table("lgcData_1.txt", header = T))
lgcData_2 <- as.data.frame(read.table("lgcData_2.txt", header = T))
lgcData_3 <- as.data.frame(read.table("lgcData_3.txt", header = T))

# Look at the data
head(lgcData_1)
head(lgcData_2)
head(lgcData_3)

# Create two data objects (one for MZ and another for DZ twin families)
mzData_1 <- lgcData_1[lgcData_1$zyg == "MZ" ,]
dzData_1 <- lgcData_1[lgcData_1$zyg == "DZ" ,]
mzData_2 <- lgcData_2[lgcData_2$zyg == "MZ" ,]
dzData_2 <- lgcData_2[lgcData_2$zyg == "DZ" ,]
mzData_3 <- lgcData_3[lgcData_3$zyg == "MZ" ,]
dzData_3 <- lgcData_3[lgcData_3$zyg == "DZ" ,]

## Next we need to tell R which variables we plan to analyze.
# There are 6 waves and two twins.
# Each wave is labeled [trial1, trial2, trial3, ...].
# Twins are identified by the suffix _1 or _2 [e.g., "trial1_1" indicates wave 1 for twin 1 and "trial4_2" indicates wave 4 for twin 2].
selVars <- colnames(lgcData_1)[1:12]
selVars

dataMZ <- mzData_1
dataDZ <- dzData_1

# Run the Linear ACE LGM model
linACElgcModel <- linACElgc(mzData = dataMZ, dzData = dataDZ, selVars = selVars, nWaves = 6)
linACElgcFit <- mxRun(linACElgcModel)
linACElgcSumm <- summary(linACElgcFit)
linACElgcSumm

# Run the Quadratic ACE LGM model
quadACElgcModel <- quadACElgc(mzData = dataMZ, dzData = dataDZ, selVars = selVars, nWaves = 6)
quadACElgcFit <- mxRun(quadACElgcModel)
quadACElgcSumm <- summary(quadACElgcFit)
quadACElgcSumm

# Run the Semiparametric ACE LGM model
semiACElgcModel <- semiACElgc(mzData = dataMZ, dzData = dataDZ, selVars = selVars, nWaves = 6)
semiACElgcFit <- mxRun(semiACElgcModel)
semiACElgcSumm <- summary(semiACElgcFit)
semiACElgcSumm

# Compare the models and determine which model fits best for each dataset.
InfoCriteria <- rbind(
  c(t(linACElgcSumm$informationCriteria)),
  c(t(quadACElgcSumm$informationCriteria)),
  c(t(semiACElgcSumm$informationCriteria)))
colnames(InfoCriteria) <- c("df Penalty AIC", "Parameters Penalty AIC", "Sample-Size Adjusted AIC",
                            "df Penalty BIC", "Parameters Penalty BIC", "Sample-Size Adjusted BIC")
rownames(InfoCriteria) <- c("Linear LGC", "Quadratic LGC", "Semiparametric LGC")
InfoCriteria
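# The same comparison should be repeated for datasets 2 and 3. The helper below is a minimal
# sketch (a hypothetical convenience wrapper around the calls shown above, not part of lgcFun.R)
# that fits all three models to one dataset and returns the information criteria side by side,
# in the same column order as the InfoCriteria table.
compareLGC <- function(mzData, dzData, selVars, nWaves = 6) {
  fits <- list(
    "Linear LGC"         = mxRun(linACElgc(mzData = mzData, dzData = dzData, selVars = selVars, nWaves = nWaves)),
    "Quadratic LGC"      = mxRun(quadACElgc(mzData = mzData, dzData = dzData, selVars = selVars, nWaves = nWaves)),
    "Semiparametric LGC" = mxRun(semiACElgc(mzData = mzData, dzData = dzData, selVars = selVars, nWaves = nWaves)))
  # one row of information criteria per fitted model
  t(sapply(fits, function(f) c(t(summary(f)$informationCriteria))))
}
# Example: repeat the comparison for dataset 2
# compareLGC(mzData_2, dzData_2, selVars = colnames(lgcData_2)[1:12])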
####
# While the fit statistics make a compelling case for selecting the best-fitting model, it is instructive
# to plot the expected means for each model to understand WHY one model fits better than another.

# Make a figure to visualize the differences in the expected means of the growth parameters
# reshape the data so that you can get a plot of several observed trajectories
nWaves <- 6
mzT1 <- dataMZ[,1:6]  ; colnames(mzT1) <- paste0("trial", 1:nWaves)
mzT2 <- dataMZ[,7:12] ; colnames(mzT2) <- paste0("trial", 1:nWaves)
dzT1 <- dataDZ[,1:6]  ; colnames(dzT1) <- paste0("trial", 1:nWaves)
dzT2 <- dataDZ[,7:12] ; colnames(dzT2) <- paste0("trial", 1:nWaves)
longData <- rbind(mzT1, dzT1, mzT2, dzT2)

# Make a figure to visualize how the growth parameters affect the observed means for each wave of your data
linMeans  <- t(linACElgcFit$output$matrices$lgcLinACE.Mean)   %*% t(linACElgcFit$output$matrices$lgcLinACE.lambda)
quadMeans <- t(quadACElgcFit$output$matrices$lgcQuadACE.Mean) %*% t(quadACElgcFit$output$matrices$lgcQuadACE.lambda)
semiMeans <- t(semiACElgcFit$output$matrices$semilgcACE.Mean) %*% t(semiACElgcFit$output$matrices$semilgcACE.lambda)

# Generate a blank plot (with the basic parameters that you want)
# Change the parameters of the graph to make it "pretty"
# (update the main title to match the dataset you are plotting)
plot(1:6, linMeans, ylab = "DV", xlab = "Wave", ylim = c(0,20), type = "b",
     main = "Trajectories of the Different Growth Models", lwd = 3, col = "white")

# randomly sample and plot 200 trajectories (you can do more if you want, but this gives a good picture IMHO)
for(i in 1:200){
  lines(1:6, longData[sample(1:nrow(longData), 1), paste0("trial", 1:nWaves)], col = i, lwd = .5)
}

# Plot the means (hint: do them one at a time)
lines(1:6, linMeans,  type = "b", lwd = 3, col = "black")
lines(1:6, quadMeans, type = "b", lwd = 3, col = "red")
lines(1:6, semiMeans, type = "b", lwd = 3, col = "blue")
legend("topleft", lwd = 3, legend = c("Linear LGC", "Quadratic LGC", "Semiparametric LGC"),
       col = c("black", "red", "blue"))

################################################################################################
### Pick one of the datasets and interpret the parameters of the best-fitting model.         ###
################################################################################################

################################################################################################
# Interpreting the Quadratic LGC Model
################################################################################################
selVars <- colnames(lgcData_1)[1:12]
dataMZ <- mzData_1
dataDZ <- dzData_1

# Run the Quadratic ACE LGM model
quadACElgcModel <- quadACElgc(mzData = dataMZ, dzData = dataDZ, selVars = selVars, nWaves = 6)
quadACElgcFit <- mxRun(quadACElgcModel)
quadACElgcSumm <- summary(quadACElgcFit)
quadACElgcSumm

# The summary of the parameter estimates is very useful for providing an overview of the model and other
# information to assess the quality of the model. To really understand what is happening, we need to look
# at the estimates inside the model in more detail. I find it is easier to do that with reference to the
# specific matrices.

# Means of the latent growth factors
# How can we get the means for the latent growth factors?
# We can use the following code:
quadACElgcFit$output$matrices$lgcQuadACE.Mean

# This will produce the following means:
#            [,1]
# [1,] 33.149999   # Latent intercept
# [2,] 12.220000   # Latent linear slope
# [3,] -4.529999   # Latent quadratic slope

# How can we calculate the expected means for each wave?
t(quadACElgcFit$output$matrices$lgcQuadACE.Mean) %*% t(quadACElgcFit$output$matrices$lgcQuadACE.lambda)
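# To see what the matrix algebra above is doing: the expected mean for any single wave is just each
# growth-factor mean weighted by that wave's factor loading and summed. A quick check for wave 1
# (using the matrix names from the fit above):
lamQ  <- quadACElgcFit$output$matrices$lgcQuadACE.lambda
meanQ <- quadACElgcFit$output$matrices$lgcQuadACE.Mean
sum(lamQ[1, ] * meanQ)  # intercept, linear and quadratic contributions added together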
# How would you interpret these parameters?
# The mean of the latent intercept tells us about the average level of the population. This is scaled by
# the factor loadings, so it may not be directly interpretable. If it is multiplied by 0.4082483 (the value
# of the constant) then you get 13.53343 (which is the mean of the population across all of the waves).
# The mean of the latent linear factor (12.22) suggests that as time progresses, we see an increase in the
# observed behavior. By multiplying the linear mean by the loading for each wave, we can assess how much
# change is occurring between waves (on average we would observe a 2.92114 difference between waves).
# The mean of the latent quadratic factor (-4.53) indicates that at later waves we will observe less change
# than we would at earlier waves. This is what gives the function its decreasing curve.

# What is the covariance between the latent intercept and the latent linear growth factor?
mxEval(alat + clat + elat, quadACElgcFit)
# or
quadACElgcFit$output$matrices$lgcQuadACE.alat +
  quadACElgcFit$output$matrices$lgcQuadACE.clat +
  quadACElgcFit$output$matrices$lgcQuadACE.elat

# What are the correlations between the latent intercept and the latent linear growth factor?
cov2cor(mxEval(alat + clat + elat, quadACElgcFit))
#            [,1]        [,2]        [,3]
# [1,]  1.0000000 -0.42787236  0.28908625
# [2,] -0.4278724  1.00000000 -0.09569516
# [3,]  0.2890863 -0.09569516  1.00000000
#
# cor(int, lin) = -0.4278724

# How would we interpret the correlation between the latent intercept and the linear slope factor?
# The correlation between the intercept and the latent linear growth factor (-0.43) suggests that there is
# a strong negative relationship between an individual's average level and how rapidly their behavior
# increases. Thus, the people who start at the highest level appear to increase more slowly than those who
# start at a lower level (on average).

# What proportion of the variance in the latent growth factors is accounted for by A, C or E factors for each growth parameter?
mxEval(alat / (alat + clat + elat), quadACElgcFit)
mxEval(clat / (alat + clat + elat), quadACElgcFit)
mxEval(elat / (alat + clat + elat), quadACElgcFit)

# It should become completely obvious that the proportion of variance that we observe for the correlation
# between the latent linear and quadratic growth factors is absolutely ridiculous. It would imply that A
# accounts for 110% of the variance and E accounts for -59% of the variance.
# The intention of this question is to highlight the dubious nature of "coheritabilities" when analyzing
# data. As Richard Feynman said: "The first principle is that you must not fool yourself and you are the
# easiest person to fool."

# What proportion of the phenotypic variance is accounted for by specific A, C or E factors/residuals?
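# Without knowing the names of the wave-specific residual matrices inside lgcFun.R, one rough way to
# approach this question is to ask how much of each wave's observed variance is transmitted through the
# latent growth factors' A, C and E components; whatever remains is attributable to the wave-specific
# residuals. A sketch under that assumption (latent matrix names as used above; the observed variance is
# pooled over twins and zygosity groups, so the proportions are only approximate):
phenData <- rbind(as.matrix(dataMZ[, 1:6]), as.matrix(dataMZ[, 7:12]),
                  as.matrix(dataDZ[, 1:6]), as.matrix(dataDZ[, 7:12]))
phenVar  <- apply(phenData, 2, var, na.rm = TRUE)
lamQ <- quadACElgcFit$output$matrices$lgcQuadACE.lambda
Alat <- quadACElgcFit$output$matrices$lgcQuadACE.alat
Clat <- quadACElgcFit$output$matrices$lgcQuadACE.clat
Elat <- quadACElgcFit$output$matrices$lgcQuadACE.elat
# per-wave variance transmitted through the latent growth factors, as a share of the observed variance
round(rbind(A_latent = diag(lamQ %*% Alat %*% t(lamQ)) / phenVar,
            C_latent = diag(lamQ %*% Clat %*% t(lamQ)) / phenVar,
            E_latent = diag(lamQ %*% Elat %*% t(lamQ)) / phenVar,
            residual = 1 - diag(lamQ %*% (Alat + Clat + Elat) %*% t(lamQ)) / phenVar), 3)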
################################################################################################
# Interpreting the Linear LGC Model
################################################################################################
selVars <- colnames(lgcData_2)[1:12]
dataMZ <- mzData_2
dataDZ <- dzData_2

# Run the Linear ACE LGM model
linACElgcModel <- linACElgc(mzData = dataMZ, dzData = dataDZ, selVars = selVars, nWaves = 6)
linACElgcFit <- mxRun(linACElgcModel)
linACElgcSumm <- summary(linACElgcFit)
linACElgcSumm

# The summary of the parameter estimates is very useful for providing an overview of the model and other
# information to assess the quality of the model. To really understand what is happening, we need to look
# at the estimates inside the model in more detail. I find it is easier to do that with reference to the
# specific matrices.

# Means of the latent growth factors
# How can we get the means for the latent growth factors?
t(linACElgcFit$output$matrices$lgcLinACE.Mean)
#          [,1] [,2]
# [1,] 8.619999 4.39

# How can we calculate the expected means for each wave?
t(linACElgcFit$output$matrices$lgcLinACE.Mean) %*% t(linACElgcFit$output$matrices$lgcLinACE.lambda)

# How would you interpret these parameters?
# The mean of the latent intercept (8.62) tells us about the average level of the population. This is
# scaled by the factor loadings, so it may not be directly interpretable. If it is multiplied by 0.4082483
# (the value of the constant) then you get 3.5191 (which is the mean of the population across all of the
# waves). The mean of the latent linear factor (4.39) suggests that as time progresses, we see an increase
# in the observed behavior. By multiplying the linear mean by the loading for each wave, we can assess how
# much change is occurring between waves (on average we would observe a 1.05 difference between waves).

# What are the covariances between the latent growth factors? The hint gives you the following covariance matrix:
mxEval(alat + clat + elat, linACElgcFit)
#           [,1]      [,2]
# [1,]  1.548451 -0.579424
# [2,] -0.579424  1.008990
# Since the question asks for the covariance between the latent intercept and the latent linear growth
# factor, the answer is: -0.58.

# What are the correlations between the latent growth factors?
cov2cor(mxEval(alat + clat + elat, linACElgcFit))
#            [,1]       [,2]
# [1,]  1.0000000 -0.4635582
# [2,] -0.4635582  1.0000000

# How would we interpret the correlation between the latent intercept and the linear slope factor?
# The correlation between the intercept and the latent linear growth factor (-0.46) suggests that there is
# a strong negative relationship between an individual's average level and how rapidly their behavior
# increases. Thus, the people who start at the highest level appear to increase more slowly than those who
# start at a lower level (on average).

# What proportion of the variance in the latent growth factors is accounted for by A, C or E factors for each growth parameter?
# A = 0.4645128
# C = 0.1806472
# E = 0.3548400
# (these values can be reproduced with the mxEval check at the end of this section)
# The results would imply that about half of the correlation between the latent intercept and the latent
# slope can be accounted for by additive genetic factors, with approximately 18% and 35% being accounted
# for by C and E. In many cases the sign of the A, C and E covariances may be different. In such cases, the
# interpretation of the "coheritabilities" is dubious. As Richard Feynman said: "The first principle is
# that you must not fool yourself and you are the easiest person to fool."

# What proportion of the phenotypic variance is accounted for by specific A, C or E factors/residuals?
# A = 0.4827425
# C = 0.1551846
# E = 0.3620729
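# The latent-growth-factor proportions quoted above for the linear model can be reproduced with the same
# mxEval calls used for the quadratic model; a quick check:
mxEval(alat / (alat + clat + elat), linACElgcFit)
mxEval(clat / (alat + clat + elat), linACElgcFit)
mxEval(elat / (alat + clat + elat), linACElgcFit)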
################################################################################################
# Interpreting the Semiparametric LGC Model
################################################################################################
selVars <- colnames(lgcData_3)[1:12]
dataMZ <- mzData_3
dataDZ <- dzData_3

# Run the Semiparametric ACE LGM model
semiACElgcModel <- semiACElgc(mzData = dataMZ, dzData = dataDZ, selVars = selVars, nWaves = 6)
semiACElgcFit <- mxRun(semiACElgcModel)
semiACElgcSumm <- summary(semiACElgcFit)
semiACElgcSumm

# The summary of the parameter estimates is very useful for providing an overview of the model and other
# information to assess the quality of the model. To really understand what is happening, we need to look
# at the estimates inside the model in more detail. I find it is easier to do that with reference to the
# specific matrices.

# Means of the latent growth factors
# How can we get the means for the latent growth factors?
t(semiACElgcFit$output$matrices$semilgcACE.Mean)
#          [,1]     [,2]
# [1,] 5.107999 7.902001

# How can we calculate the expected means for each wave?
t(semiACElgcFit$output$matrices$semilgcACE.Mean) %*% t(semiACElgcFit$output$matrices$semilgcACE.lambda)

# How would you interpret these parameters?
# The mean of the latent intercept (5.11) tells us about the average level of the population. This is
# scaled by the factor loadings, so it may not be directly interpretable. If it is multiplied by 1 (the
# value of the constant) then you get 5.11 (which is the mean of the population at wave 0, before we
# started recording the data). The mean of the latent slope factor (7.90) suggests that as time progresses,
# we see an increase in the observed behavior. By multiplying the slope mean by the loading for each wave,
# we can assess how much change is occurring between waves (on average we would observe a 4.43 difference
# between waves). However, as we have a big jump in the factor loadings, the "on average" may not reflect
# anything all that relevant to the study. Thus, we should interpret these factors with deference to the
# pattern of means that we observe and fit.

# What are the covariances between the latent growth factors?
mxEval(alat + clat + elat, semiACElgcFit)
#           [,1]      [,2]
# [1,]  3.121273 -2.495901
# [2,] -2.495901  3.269126
# Because the question asked about the covariance between the intercept and the slope, the answer is
# really: -2.50.

# What are the correlations between the latent growth factors?
cov2cor(mxEval(alat + clat + elat, semiACElgcFit))
#            [,1]       [,2]
# [1,]  1.0000000 -0.7813499
# [2,] -0.7813499  1.0000000

# How would we interpret the correlation between the latent intercept and the slope factor?
# The correlation between the intercept and the latent slope factor (-0.78) suggests that there is a strong
# negative relationship between an individual's average level and how rapidly their behavior increases.
# Thus, the people who start at the highest level appear to increase more slowly than those who start at a
# lower level (on average).

# What proportion of the variance in the latent growth factors is accounted for by A, C or E factors for each growth parameter?
mxEval(alat / (alat + clat + elat), semiACElgcFit)
mxEval(clat / (alat + clat + elat), semiACElgcFit)
mxEval(elat / (alat + clat + elat), semiACElgcFit)
# A = 0.4660201
# C = 0.1787124
# E = 0.3552676

# What proportion of the phenotypic variance is accounted for by specific A, C or E factors/residuals?
# A = 0.4611064
# C = 0.1858689
# E = 0.3530247

# The results would imply that about 50% of the correlation between the latent intercept and the latent
# slope can be accounted for by additive genetic factors, with approximately 18% and 36% being accounted
# for by C and E. In many cases the sign of the A, C and E covariances may be different. In such cases, the
# interpretation of the "coheritabilities" is dubious. As Richard Feynman said: "The first principle is
# that you must not fool yourself and you are the easiest person to fool."
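# Before turning to the factor loadings, it can help to look at the wave-to-wave changes in the expected
# means directly, rather than relying on a single "average" increment (matrix names as used above):
semiExpMeans <- t(semiACElgcFit$output$matrices$semilgcACE.Mean) %*%
  t(semiACElgcFit$output$matrices$semilgcACE.lambda)
diff(as.vector(semiExpMeans))  # note the large jump between waves 3 and 4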
# How would you interpret the factor loadings of the semiparametric model?
mxEval(lambda, semiACElgcFit)
#      [,1]     [,2]
# [1,]    1 1.000000
# [2,]    1 1.555556
# [3,]    1 2.111111
# [4,]    1 4.888889
# [5,]    1 5.444444
# [6,]    1 6.000000

# As you can see, there is a general linear increase in the factor loadings from wave 1 to wave 3 and from
# wave 4 to wave 6, with a big jump between waves 3 and 4. This pattern of factor loadings mirrors what we
# see in the observed and expected means of the variables.
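# A quick way to visualize that pattern is to plot the estimated slope loadings against wave number;
# a minimal sketch:
semiLambda <- mxEval(lambda, semiACElgcFit)
plot(1:6, semiLambda[, 2], type = "b", pch = 16,
     xlab = "Wave", ylab = "Estimated slope loading",
     main = "Semiparametric LGC basis coefficients")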