################################################
##        Twin-Based Introduction to R        ##
################################################
## This link provides some good introductory explanations to 
## compare base R graphics 
## and ggplot2 graphics which you will be learning later in the course.
## http://seriousstats.wordpress.com/2012/09/05/highqualitygraphs/
## https://www.datanovia.com/en/blog/top-r-color-palettes-to-know-for-great-data-visualization/#viridis-color-palettes
## Questions? 
## email- Elizabeth.Prom-Wormley@vcuhealth.org

# Address for Boulder Workshop
setwd('/home/elizabeth/2021/Working with Twin Data in R')

# Code to install packages.  You only need to do this once.
#install.packages('psych', dependencies=TRUE)
#install.packages('gplots', dependencies=TRUE)
#install.packages('car', dependencies=TRUE)
#install.packages('descr', dependencies=TRUE)
#install.packages('gmodels', dependencies=TRUE)
#install.packages("RColorBrewer")
#install.packages('metafor', dependencies=TRUE)

# Code to run packages in a given session. 
# You need to do this every time you have a new R session.

library(psych)
library(car)
library(gmodels)
library(descr)
library(gplots)
library(RColorBrewer)

# Setting working directory. Your address will be different.
# Mac address
setwd('/Users/ecpromwormle/Documents/MyDocuments/Boulder/Boulder2021')

# PC address example
# setwd('C:\\Users\\cliffordjs\\Desktop\\R Training Scripts')

twins<- read.table(file='tw19long_v3.txt',header=T,sep="\t")
#twins2<- read.table(file='tw19wide_v3.txt',header=T,sep="\t")
## Review- What are ways we are able to get preliminary information 
## on our dataset/variables?

dim(twins)
head(twins)
names(twins)
summary(twins)
str(twins)

# Frequency Tables using descr pacakage
CrossTable(twins$SEX)
#CrossTable(twins$ZYG2)
#CrossTable(twins$vB2bin, twins$SEX)


# Turn on PDF device
#pdf(file="ClassGraphs.pdf")

# Plot one histogram of vA1con
hist(twins$vA1con, col="gray")

#Plot continuous measures of interest all at the same time
par(mfcol=c(3,3))
for (i in c(6,8,11,14,18,19,20,21,22)){
  hist(twins[,i],main= paste(c("Histogram", "of",names(twins[i])),sep=" "),
       probability=TRUE,col= "gray", 
       xlab= paste("mean",c(names(twins[i])), "=", 
       round(mean(twins[,i],na.rm=TRUE),2)))
  d1<-density(twins[,i])
  lines(d1)
}

for (i in c(6,8,11,14,18,19,20,21,22)){
  qqnorm(twins[,i], 
         main= paste(c("QQ Plot", "of",names(twins[i])),sep=" "),
         xlab= paste(c(names(twins[i]))))
}

### Using boxplots
boxplot(twins$vA1con~twins$SEX, col="gray")
boxplot(twins$vA1con~twins$ZYG2, col="gray")

for (i in c(6,8,11,14,18,19,20,21,22)){
  boxplot(twins[,i]~twins$ZYG5,
          main= paste(c("Boxplot", "of", names(twins[i]), 
                        "by Zygosity"),sep=" "), col="gray")
}

## LET'S PLAY!
## How would you alter the previous script to visualize 
## whether there are differences by zygosity groups?


# Turn off PDF device
#dev.off() 


###  Moving on to working with twin data ###
## Get twin data in the right format for future use ##

## We will do this step-by-step first. 
## Then, we will automate with a function.
## Last, we will share function that automates completely

## 1- Separate original object into two objects: 
## A Twin 1 object and a Twin 2 object
datA <- twins[twins[,"TWID"]==min(twins[,"TWID"]),]  #twin1 object
datB <- twins[twins[,"TWID"]==max(twins[,"TWID"]),]  #twin2 object

## 2- Merge Twin 1 and Twin 2 objects by a Family ID ("FAMID") 
## that is common to both
## 3- During Merge, ensure that data are differentiated between 
## Twin 1 and Twin 2 (e.g., "VariableName1_T1 and VariableName1_T2")

DAT <- merge(datA, datB, by="FAMID", all.x=TRUE, all.y=TRUE, 
             suffixes=c("_T1","_T2"))

## 4- Remove data that is "unnecessary". 
## TWID is no longer necesssary since we have indicated which data come 
## from twin 1 and which come from twin 2
DAT[,paste("TWID","_T1",sep="")] <- NULL
DAT[,paste("TWID","_T2",sep="")] <- NULL

## Make a single zygosity variable (ZYG2, ZYG5, ZYG6) 
## rather than one for each twin.  
## All of these different approaches do the same thing. 
DAT$ZYG5 <- ifelse(is.na(DAT$ZYG5_T1),
                   DAT$ZYG5_T2,
                   DAT$ZYG5_T1)

DAT[,"ZYG2"] <- ifelse(is.na(DAT[,"ZYG2_T1"]),
                         DAT[,"ZYG2_T2"],
                         DAT[,"ZYG2_T1"])
zygosity<- "ZYG6"
DAT[,zygosity] <- ifelse(is.na(DAT[,paste(zygosity,"_T1",sep="")]),
                       DAT[,paste(zygosity,"_T2",sep="")],
                       DAT[,paste(zygosity,"_T1",sep="")])

## Removing Twin1 and Twin2 Zygosities
DAT$ZYG2_T1<- NULL
DAT$ZYG2_T2<- NULL
DAT[,"ZYG5_T1"] <- NULL
DAT[,"ZYG5_T2"] <- NULL
DAT[,paste(zygosity,"_T1",sep="")] <- NULL
DAT[,paste(zygosity,"_T2",sep="")] <- NULL

## Automate by putting together a twin-focused function.
## Put twin data all on one line using twin function.
#INPUT:  data.frame where each data for each individual is listed in separate record
#OUTPUT: data.frame where twin pairs are listed in a single record
twindat <- function(dat, famid, twinid, zygosity1, zygosity2, zygosity3) {
  datA <- dat[dat[,twinid]==min(dat[,twinid]),]    #twin1
  datB <- dat[dat[,twinid]==max(dat[,twinid]),]    #twin2 
  
  DAT <- merge(datA, datB, by=famid, all.x=TRUE, all.y=TRUE, 
               suffixes=c("_T1","_T2"))
  
  DAT[,paste(twinid,"_T1",sep="")] <- NULL
  DAT[,paste(twinid,"_T2",sep="")] <- NULL
  
  DAT[,zygosity1] <- ifelse(is.na(DAT[,paste(zygosity1,"_T1",sep="")]),
                           DAT[,paste(zygosity1,"_T2",sep="")],
                           DAT[,paste(zygosity1,"_T1",sep="")])
  DAT[,paste(zygosity1,"_T1",sep="")] <- NULL
  DAT[,paste(zygosity1,"_T2",sep="")] <- NULL
  
  DAT[,zygosity2] <- ifelse(is.na(DAT[,paste(zygosity2,"_T1",sep="")]),
                            DAT[,paste(zygosity2,"_T2",sep="")],
                            DAT[,paste(zygosity2,"_T1",sep="")])
  DAT[,paste(zygosity2,"_T1",sep="")] <- NULL
  DAT[,paste(zygosity2,"_T2",sep="")] <- NULL
  
  DAT[,zygosity3] <- ifelse(is.na(DAT[,paste(zygosity3,"_T1",sep="")]),
                            DAT[,paste(zygosity3,"_T2",sep="")],
                            DAT[,paste(zygosity3,"_T1",sep="")])
  DAT[,paste(zygosity3,"_T1",sep="")] <- NULL
  DAT[,paste(zygosity3,"_T2",sep="")] <- NULL
  return(DAT)
} 

# Four arguments needed for twindat() function
Twins2 <- twindat(dat=twins, famid="FAMID", twinid="TWID", 
                zygosity1="ZYG2", zygosity2="ZYG5", zygosity3="ZYG6")


# Another way is by using the reshape function 

Twins2a <- reshape(data = twins, direction="wide", 
                   idvar = c('FAMID','ZYG2','ZYG5','ZYG6'), 
                   timevar = "TWID", sep = "_")
write.table(Twins2a, file = "Twins2a.csv", row.names=FALSE, 
            col.names=TRUE, sep = ",")

## REVIEW
## We identified 3 ways to prepare twin data for further analysis.
## 1- A self-coded step-by-step approach
## 2- An automated approach using a function that you personalize
## 3- An already available function
## What you decide may depend on how dirty your data are.  


## WORKING WITH TWIN DATA ##
## Let's start getting a feel for the data
dim(Twins2)
names(Twins2)
head(Twins2)

MZ <- subset(Twins2,Twins2$ZYG2==1)
DZ <- subset(Twins2,Twins2$ZYG2==2)

# Scatterplot of twin relationships #
par(mfcol=c(1,1))
plot(MZ$vA1con_T1,MZ$vA1con_T2, col="black")
plot(DZ$vA1con_T1,DZ$vA1con_T2, col="black")
plot(MZ$AGE_T1,MZ$AGE_T2, col="red")
plot(DZ$AGE_T1,DZ$AGE_T2, col="blue")

# Adding a loess curve 
par(mfcol=c(1,1))
pairs(MZ[c(4,29)],panel=panel.smooth, main="Scatterplot of MZ pairs")
pairs(DZ[c(4,29)],panel=panel.smooth, main="Scatterplot of DZ pairs")


##############################
# Graphing Twin Correlations #
##############################

# Calculating Twin Correlations by Zygosity #
rMZ<-cor(MZ$vA1con_T1,MZ$vA1con_T2,use="pairwise.complete.obs")
rDZ<-cor(DZ$vA1con_T1,DZ$vA1con_T2,use="pairwise.complete.obs")

# Combining results into one object #
Corr2<-cbind(rMZ, rDZ)
write.table(Corr2,file="corrs.csv",sep=",",row.names=FALSE,col.names=FALSE)

#setup overall background to make transfer to slides prettier
par(bg="white")

#making barplot of twin correlations and putting a box around figure
barplot(Corr2,ylim=c(0,1),col=c("cyan"),
        cex.axis=1.5,cex.names=1.5,space=0.1)

# Same barplot with a color palette other than Base R
# via the RColorBrewer package.  See below for more 
# code to get started with RColorBrewer
#barplot(Corr2[c(1,2)],ylim=c(0,1),
#        col=c("#1D91C0","#41B6C4"),
#        cex.axis=1,cex.names=1,space=0.1)
#legend("topright",legend=c("rMZ", "rDZ"), pch=15, 
#       pt.cex=1.5,col=c("#1D91C0","#41B6C4"))

#box()