#####################################################################
# Program: IntroToR2018.R
# Author: Elizabeth Prom-Wormley
# Date: March 2, 2018
# 
# Questions???
# Just ask in person or virtually by e-mail 
# Elizabeth.Prom-Wormley@vcuhealth.org
#
#
# This script will help you learn enough R basics to get 
# the most of this week. 
#
#
# This header is a comment.  Any text after a hash mark (#)
# is ignored by R until the next carriage return.  So, if you
# put a hash mark at the beginning of a line, the whole line
# is ignored.
#
# This script is formatted for a text window that is 72 columns wide.
#
# As we go through this file, you are welcome to write your
# own comments so that you can come back to this file and
# remind yourself of what was happening. These can also serve as
# class notes.
#
# We encourage you to make these notes in all of the scripts
# in the workshop.  It will be easier to come back later
# and modify them for your own use.	
#
# Learning goals for this script/lecture are as follows:
# 
# 1- Find/open files
# 2- Learn how R works and thinks about files/data
# 3- How to use R to get a feel for the data	
# 4- Basic data management 			
# 5- Summary Statistics
# 6- Summarizing results for use outside of R
# 7- Introduction to functions
####################################################################

####################################################################
# 1- OPENING A FILE
####################################################################

# First, we have to know where to find the file
getwd()

# PC Users
setwd('/home/elizabeth/2018')

#Mac users
setwd('/Users/ecpromwormle/Documents/MyDocuments/OZclass2016')

getwd()

# Now, we can open the file
YTD <- read.table(file = "twinData.csv", header = TRUE, sep = ",")
# Translation
# We just made an object called "YTD" that came from reading a file   
# called "YoungTwinData.csv" that has variables names (header=TRUE)   
# and is comma delimited (sep=",")


####################################################################
# 2- LEARNING HOW R THINKS/WORKS WITH DATA
####################################################################

# First, know your data structure
# Why should you care?
class(YTD)


# There are four typical classes of data structures: data frames,
# matrices, vectors, and lists.  Your need for a specific data 
# structure typically depends on the function you are using.  
# We will focus most of our time this week using data frames.
# We will discuss a bit over the next couple of lines
# For more information on data types visit
# http://adv-r.had.co.nz/Data-structures.html


# DATA FRAME- Most general data structure because you can have many 
# different types of data (ie: one variable is character and another
# is numeric)
head(YTD)
is.data.frame(YTD)

# LIST- A collection arbitrary objects (can be of different 
# types) under a single name data.frames have the structure of 
# a type of list. Requirement that all list elements are
# of the same length.
alist <- list(one= 1:100, two= c("a","b",LETTERS[18]), three= pi)
class(alist)
is.list(alist)

# MATRIX- All elements of a matrix have the same mode 
# (numeric or character). They allow for efficient mathematical
# operations and manipulation not available for data frames.                                              
head(YTDM2)
is.matrix(YTDM2)

# VECTOR- One long row of data.  Allows for efficient operations
# not available for data frames.
YTD[,3]
is.vector(YTD[,3])
ls()

# Removing an object (or two)
rm(list=c("alist"))


####################################################################
# 3- GETTING A FEEL FOR THE DATA 
####################################################################
	
# How do we know what's in that data frame?
dim()
names()
head()
tail()
str()
summary()


# What if we want to look at one specific variable (age)?

YTD$age1

# We could also use the index number assigned to that variable
YTD[7]

# This works too
YTD[, 7]

# What does this do?
YTD[7, ]


####################################################################
# 4- BASIC DATA MANAGEMENT 
####################################################################


# Frequencies 
# Coding note 1 - zyg is coded as follows: 1=MZf, 2=MZm, 3=DZf, 4=DZm, 5=DZfm, 6 = DZmf
table(YTD$zyg, useNA ="always")


# Can also use the CrossTable function in descr
install.packages("descr")
require(descr)
# Translation- In the two lines above we installed a package called descr.  
# There are many packages in R to allow you to do many things and it would
# take too much memory to download all the packages. So, you can just open one 
# of interest.  In this case, gmodels.  It has a 


CrossTable(x = as.factor(YTD$zyg), format = "SAS", missing.include = TRUE)

# Coding note 2 - sex is coded as follows: XXXX 

# CLASS TASK- Figure out how sex is coded.
CrossTable(x = as.factor(YTD$zyg), y = as.factor(YTD$sex1), format = "SAS", missing.include = FALSE)


# Recoding variables (object elements)
install.packages("car")
require(car)
YTD$zyg2 <- recode(YTD$zyg,"1 = 'MZ'; 2 = 'MZ' ; 3 = 'DZ'; 4 = 'DZ'; 5 = 'DZ'; 6='DZ' ")


# Transforming variables
YTD$age1t <- log10(YTD$age1)
YTD$age2t <- log10(YTD$age2)


# Creating new variables 
# Two Categories

quantile (YTD$mem1, probs = 0.9) 
quantile (YTD$mem2, probs = 0.9) 
YTD$memBin1 <- ifelse(YTD$mem1 > 1.247992, yes = 1, no = 0)
YTD$memBin2 <- ifelse(YTD$mem2 > 1.287143, yes = 1, no = 0)

# Three Categories
quantile (YTD$mem1, probs = 0.9) 
quantile (YTD$mem1, probs = 0.75) 

YTD$MemCat3t1[YTD$mem1 <= 0.6753861] <- 0
YTD$MemCat3t1[0.6753861< YTD$mem1 & YTD$mem1 <= 1.247992 ] <- 1
YTD$MemCat3t1[YTD$mem1 > 1.247992 ] <- 2


# Sorting by ord1
YTDsorted <- YTD[order(YTD$ord1), ]
head(YTDsorted)
print(YTDsorted$ord1)


# Selecting Variables
YTDsmall <- YTD[c(1,2,3,4,5,6,7,8,12,13)]


# what if you wanted all variables except for zygosity?
YTDno2 <- YTD[-c(2)]


# Selecting Observations
YTD8 <- YTD[which(YTD$age1 >= 8), ]
# What if we wanted to add another filter for selection?


# Logical Operator Review
# & and
# | or
# == equal to
# != not equal to
# <= less than or equal to
# >= greater than or equal to
# (x >= 0) & (x<=1)  x is between zero and one


# Subsetting-- what are we doing here?
Vars      <- 'mem'
nv        <- 1      
ntv       <- nv*2    
selVars   <- paste(Vars, c(rep(1,nv), rep(2,nv)), sep = "")   

mzData    <- subset(YTD, zyg == 1, selVars)
dzData    <- subset(YTD, zyg == 3, selVars)


####################################################################
# 5a- GETTING TO KNOW THE DATA- SUMMARY STATISTICS
####################################################################

# Basic summary statistics
summary()
mean()
var()
cov()
cor()
# Also polycor package for polychoric, polyserial correlations
# See- http://cran.r-project.org/web/packages/polycor/polycor.pdf


####################################################################
# 5b- GETTING TO KNOW THE DATA- GRAPHING
####################################################################

hist(YTD$mem1, col = "gray", xlab = "Measure", ylab = "Weight", 
main = "Distribution of Weight for Twin 1")
plot(mzData$mem1, mzData$mem2, col = 'red', pch = 2, main = "MZ Correlations for MEM")
plot(dzData$mem1, dzData$mem2, col = 'blue', pch = 3, main = "DZ Correlations for MEM")
qqnorm()
qqline()
boxplot(YTD[c()])


###  Graphing the same thing over several variables ###
par(mfcol=c(1,3))

for (i in 9:11){
hist(YTD[, i], main= paste(c("Histogram","of", names(YTD[i])), 
sep = " "), probability = TRUE, col = "lightblue")

d1<-density(YTD[,i], na.rm = TRUE)
lines(d1, lty = 2, lwd = 3)
}

# Why did I  use both"YTD[,i]" and  "YTD[i]" in the 
# prior example?


# Putting it All Together

# CLASS TASK- Advanced Option
# 1- Calculate twin correlations for MZ and DZ pairs for mem
# 2- Plot the correlations as barplots


# CLASS TASK ANSWERED- Calculating Twin Correlations by Zygosity #
par(mfcol = c(1,1))
rMZ <- cor(mzData$mem1, mzData$mem2, use = "pairwise.complete.obs")
rDZ <- cor(dzData$mem1, dzData$mem2, use = "pairwise.complete.obs")

# What is happening here? What is cbind()?#
corrs <- cbind(rMZ, rDZ)
rbind()

# Setup overall background to make transfer to slides prettier
par(bg = "white")

# Making barplot of twin correlations and putting a box around figure
barplot(corrs, ylim = c(0,1), col = c("cyan"), cex.axis = 2, cex.names = 2, space = 0.1)
box()


# Putting it all together- Reporting Results
# CLASS TASK- Advanced Option
# 1- Calculate means, variances, and covariances of mem
# for each twin by zygosity (ie: mean of mem1 for MZ pairs)
# 2- Aggregate MZ results in a single object
# 3- Ouptut MZ results to a comma delimited (.csv) file for 
# future use in another program


## CLASS TASK ANSWERED 
xmem1mz <- mean(x = mzData$mem1, na.rm = TRUE)
xmem2mz <- mean(x = mzData$mem2, na.rm = TRUE)
xmem1dz <- mean(x = dzData$mem1, na.rm = TRUE)
xmem2dz <- mean(x = dzData$mem2, na.rm = TRUE)

varmem1mz <- var(x = mzData$mem1, na.rm = TRUE)
varmem2mz <- var(x = mzData$mem2, na.rm = TRUE)
varmem1dz <- var(x = dzData$mem1, na.rm = TRUE)
varmem2dz <- var(x = dzData$mem2, na.rm = TRUE)

mzcov <- cov(x = mzData$mem1, y = mzData$mem2, 
             use = "pairwise.complete.obs")
dzcov <- cov(x = dzData$mem1, y = dzData$mem2, 
             use = "pairwise.complete.obs")

####################################################################
# 5- USING RESULTS OUTSIDE OF R
####################################################################

# Result summaries for general summary statistics
results <- round(c(xmem1mz, varmem1mz, mzcov),2)
names(results) <- c("meanmemt1MZs", "varmemt1MZs" , "memmzcov")
write.table(x = results, file = "MZresults.csv", sep = ",", 
            col.names = TRUE, row.names = FALSE)

# Result summaries for frequencies


# Advanced Option
# 1- Make a frequency table of ord1 by sex1 and report the number of people in each
# age category.  Report both N and %.
#
# 3- Output results into a comma delimited file (.csv) for use 
# in another program (ie: Excel) for reporting


# One approach to the frequency problem from above
AgeFreq <- CrossTable(x = as.factor(YTD$agecat), format = "SAS", 
missing.include = TRUE)

FreqOut1 <- c(AgeFreq$t[1], round(AgeFreq$prop.row[1],2))
FreqOut2 <- c(AgeFreq$t[2], round(AgeFreq$prop.row[2],2))
FreqOut3 <- c(AgeFreq$t[3], round(AgeFreq$prop.row[3],2))

Freqs <- as.data.frame(rbind(FreqOut1, FreqOut2, FreqOut3))
names(Freqs) <- c("N", "Percent")
Freqs$AgeCat <- c("< 20", "20-27", "> 28")
Freqs2 <- Freqs[c(3,1,2)]
write.table(x = Freqs2, file = "Freqs2.csv", sep = ",", 
col.names = TRUE, row.names = FALSE)


# Saving Graphics
# One way is simply copy and paste.  Another is to output to 
# another file (ie: PDF)

pdf(file="TwinCorrelations.pdf")

# Setup overall background to make transfer to slides prettier
par(bg = "white")
par(mfcol=c(1,2))

# Making barplot of twin correlations and putting a box around figure
barplot(corrs, ylim = c(0,1), col = c("cyan"), cex.axis = 2, 
cex.names = 2, space = 0.1)
box()

dev.off()


# Just for your knowledge.  
# Removing objects in your working directory
# Be careful when you run this.
# ls()
# rm(list=ls())
# ls()


####################################################################
# 5- BASIC FUNCTION DEVELOPMENT
####################################################################

# Function refresher- A function is previously developed code that 
# the user invokes to do something that they may do often.

# Example- Running a simple correlation
cor(YTD$wt1, YTD$wt2, use = "pairwise.complete.obs", method = 'pearson')

# But what does cor() do, really??
?cor()
cor

# What if you wanted to write your own function to do your own thing?

# Data Preparation Step
YTDt1 <- YTD[c(1,2,3,6,12)]


YTDt2 <- YTD[c(1,2,4,7, 13)]

names(YTDt1) <- c("ID", "zyg", "sex", "age", "mem")
names(YTDt2) <- c("ID", "zyg", "sex", "age", "mem")

YTDt1$twinid <- 1
YTDt2$twinid <- 2

PopData <- rbind(YTDt1, YTDt2)


# When analyzing twin/family data it is necessary to change data 
# configuration so that each twin/family exists on its own line
# Steps requried (focusing on twin data)
# 1- Make a twin one object (twin1)
# 2- Make a twin two object (twin2)
# 3- Merge twin one and twin two objects by family number and provide an 
#    indicator on a variable name that demonstrates whether the variable 
#    belongs to twin one or twin two

# Try it yourself!
  datT1 <- PopData[PopData$twinid == min(PopData$twinid),]
  datT2 <- PopData[PopData$twinid == max(PopData$twinid),]
  
  datTP <- merge(datT1, datT2, by = c("ID", "age", "zyg"), 
  		all.x = TRUE, all.y = TRUE, suffixes = c("_T1","_T2"))
  datTP$twinid_T1 <- NULL
  datTP$twinid_T2 <- NULL

# This type of work is done so frequently, it might be worth making it more 
# generic to be able to do it whenever you want...or make a function
#
# NOTE: This is a very BASIC twin merging function and assumes complete
# data for all twins on the variables for which you are merging
# You will need to modify this to address issues such as missing zygosity
# on one twin or another
#
#
#
# Define your function 
# We write the line that you will use to invoke a function called "twindat" 
# and defining all the necessary pieces of your function to make it work.
# Also indicate that you have begun building the function, using
# curly bracket- { 
	
twindat <- function(dat, famid, twinid, zyg) {
	# Function step 1- pull out Twin 1 and Twin 2 specific objects	
	datT1 <- dat[dat[,twinid] == min(dat[,twinid]),]
  datT2 <- dat[dat[,twinid] == max(dat[,twinid]),]
  
  	# Function step 2- merge Twin 1 and Twin 2 objects by family id 
	DAT <- merge(datT1, datT2, by = c(famid), all.x = TRUE, all.y = TRUE, suffixes = c("_T1","_T2"))
	
    # Function step 3- output an object that contains the newly merged data 
    # Also indicate end of function with curly bracket- }
  return(DAT)
}

# Use your function
NewTwinData <- twindat(dat = PopData, famid = "ID", twinid = "twinid", zyg = "zyg")

# How could I modify this function?


####################################################################
# RESOURCES  
####################################################################


# 1- R for SAS and SPSS Users 
# A great handbook if you come from either of these programs and need 
# some help in translating into R
# The following is a link to a pdf file with a shorter version of 
# the hard copy book
# https://science.nature.nps.gov/im/datamgmt/statistics/R/documents/R_for_SAS_SPSS_users.pdf

# 2- An R Style Guide
# There are a few rules that help make it easier for you/others to  
# follow your R code.  Suggestions are provided here
# http://google-styleguide.googlecode.com/svn/trunk/Rguide.xml

# 3- RSeek
# A Google search engine for all things R related
# http://www.rseek.org/

# 4- R-Bloggers
# R-Bloggers.com is a central hub (e.g: A blog aggregator) of content    
# collected from bloggers who write about R (in English).
# You can receive post updates by Facebook or daily e-mail
# http://www.r-bloggers.com/

# 5- Quick-R's section on graphics
# Good starting place  to get some pointers for improving your figures
# http://www.statmethods.net/advgraphs/parameters.html

# 6- More information on data frames
# http://www.r-bloggers.com/exploratory-data-analysis-useful-r-functions-for-exploring-a-data-frame/

# 7- Details on how to develop your own package
# http://r-pkgs.had.co.nz/

# 8- Details on apply(), sapply(), and lapply()
# http://www.r-bloggers.com/using-apply-sapply-lapply-in-r/