#
# File selonxOpenMx.R
#
# Simulate data to illustrate that maximum likelihood is robust to
# certain types of missing data, particularly where missingness is 
# predicted by measures in the dataset that are not missing.
# Here we will explore what happens to the estimates 
# when the second variable (Y) is missing whenever the first variable 
# (X) is greater than zero
# 
# M C Neale March 6 2020 
#
# Load Libraries

library("MASS")
library(OpenMx)
library(rgl)
library(fMultivar)
library(car)
library(plot3D)

# Simulate bivariate data r=.5
set.seed(5692148)
r <- 0.5	# Correlation (r)
N <- 1000	# Number of pairs of scores to simulate
Sigma	<- matrix(c(1,r,r,1),2,2)
Mu 		<- c(0,0)
pairs 	<- mvrnorm(N, Mu, Sigma)
varnames <- c("X","Y")
dimnames(pairs) <- list(NULL,varnames)

# Make a copy of the data for later
pairsOriginal <- pairs

# Take a look at complete data
cov(pairsOriginal)
colMeans(pairsOriginal, na.rm=T)
scatter.smooth(pairsOriginal)

# Now to make some of the **Y scores missing due to value of X**
# This is known as Missing At Random (MAR) in Little & Rubin (1987) sense
#########  CASE 1: Y is missing iff X > 0 "SelonX" #########
pairs[pairs[,1]>0, 2] <- NA
scatter.smooth(pairs)
# The plot doesn't show the unmatched X observations

# Nifty, that, now let's look at some summary statistics of the observed data
head(pairs)
colMeans(pairs, na.rm=T)
cov(pairs,use='complete.obs')
cor(pairs,use='complete.obs')
# So complete data only have funny summary statistics

# Set up OpenMx model to estimate variances, covariance and means (it's a saturated model)
sigma 	<- mxMatrix("Symm", 2, 2, values=diag(2), free=T, name="sigma")
mu 		<- mxMatrix("Full", 1, 2, free=T, name="mu")
obj 	<- mxFitFunctionML();
expec	<- mxExpectationNormal(covariance="sigma", mean="mu", dimnames=varnames)
data 	<- mxData(pairs, type='raw')
missMod	<- mxModel("MAR SelonX Missing on Y due to X values", sigma, mu, obj, expec, data)

# Fit model and look at output
summary(missModRun <- mxRun(missMod), refModels=mxRefModels(missMod, run = TRUE))
mxEval(cov2cor(sigma),missModRun)

# Should see that despite being given data with sample means away from zero,
# 	and sample variances away from 1.  How do the ML estimates look?

# Here we compare the results of the simulation-recover exercise
# to the values used for the simulation (subModel) and look at the chi-squared
# test.  To evaluate the performance of a model, this step could
# be looped and compared to the chi-sq with 5 df repeatedly
subModel <- mxRename(missModRun,"missModSub")
subModel$sigma$values[]<-matrix(c(1,r,r,1),2,2)
subModel$sigma$free[]<-F
subModel$mu$values[]<-0
subModel$mu$free[]<-F
subModelFit <- mxRun(subModel)
mxCompare(missModRun, subModelFit)

#########  CASE 2: Y is missing iff X > 0 but analyze complete pairs #########
# Now what would happen if we foolishly analyzed only the complete pairs?
data2 <- mxData(pairs[!is.na(pairs[,2]),], type='raw')
missMod2 <- mxModel("Missing on Y due to X selonX but Complete Pairs Only", sigma, mu, obj, expec, data2)
summary(missModRun2 <- mxRun(missMod2), refModels=mxRefModels(missMod2, run = TRUE))
mxEval(cov2cor(sigma),missModRun2)


#########  CASE 3: Y is missing iff **Y** > 0  "SelonY" #########
# Now to make some of the **Y scores missing due to value of Y**
pairsMissOnY <- pairsOriginal
pairsMissOnY[pairsMissOnY[,2]>0,2] <- NA
data3 <- mxData(pairsMissOnY, type='raw')
missMod3 <- mxModel("NMAR Missing on Y due to Y values selonY", sigma, mu, obj, expec, data3)
summary(missModRun3 <- mxRun(missMod3), refModels=mxRefModels(missMod3, run = TRUE))
mxEval(cov2cor(sigma),missModRun3)

# What do you notice about these ML estimates?  

# Let's plot the data

pairsNA3 <- pairs
pairsNA3[is.na(pairsNA3[,2]),2]<--4
heights <- dnorm2d(pairsOriginal[,1], pairsOriginal[,2], rho = cor(pairsOriginal)[2,1])
scatter3D(pairsNA3[,1],pairsNA3[,2], heights, phi = 40, theta = 90)

# And plot it so we can manipulate it
plot3d(pairsNA3[,1],pairsNA3[,2], heights, rho = cor(pairsOriginal)[2,1],phi = 40, theta = 90)

#########  CASE 4: Y is missing iff X > 0  but FIX x mean to 0 #########
missModFixedMean <- missMod
missModFixedMean$mu@values[1,1] <- 0
missModFixedMean$mu@free[1,1] 	<- F
missModFixedMean
summary(missModFixedMeanFit <- mxRun(missModFixedMean))
mxEval(cov2cor(sigma),missModFixedMeanFit)


# Conclusion: incorrect estimate of mean of X causes bias in other parameter estimates