# ---------------------------------------------------------------------
# Program: TW-IntroToR-20100210.R
#  Author: Matt Keller, Steve Boker, Michael Spiegel
#    Date: Mon Mar 5 09:12:21 EST 2012
#
# WELCOME TO THE WONDERFUL WORLD OF R!
#
# This is an introductory script for learning the very basics
#   of R.  This header is a comment.  Any text after a hash mark (#)
#   is ignored by R until the next carriage return.  So, if you
#   put a hash mark at the beginning of a line, the whole line
#   is ignored.
#
# This script is formatted for a text window that is 72 columns wide.
#
# As we go through this file, we encourage you to write your
#   own comments so that you can come back to this file and
#   remind yourself of what was happening. 
#
# We encourage you to make these notes in all of the scripts
#   in the workshop.  It will thus be easier to come back later
#   and modify them for your own use.
#
# ---------------------------------------------------------------------
# Revision History
#   Matt Keller & Steve Boker -- Wed Feb 10 12:02:11 EST 2010
#      Created TW-IntroToR-20100210.R.
#   Keller, Boker, & Spiegel -- Mon Mar 5 09:13:12 EST 2012
#      Revision to more closely support the Twin Workshop curriculum.
#
# Note: I always put a revision history in all of my scripts!
# ---------------------------------------------------------------------

# ---------------------------------------------------------------------
# Variables  (I always have a section here describing my variables)
# ---------------------------------------------------------------------
#
# Fred -- Fred is a variable that has the numeric value 4
#
# TwinExample1 -- simulated data frame
# TwinExample1$ID       -- ID number of twin pairs
# TwinExample1$Zygosity -- "MZ" or "DZ"
# TwinExample1$Height1  -- Height of twin 1
# TwinExample1$Weight1  -- Weight of twin 1
# TwinExample1$Height2  -- Height of twin 2
# TwinExample1$Weight2  -- Weight of twin 2
#
# ---------------------------------------------------------------------


# ---------------------------------------------------------------------
# PART 1: USING R AS A CALCULATOR
# ---------------------------------------------------------------------

# Arithmetic with two numbers
2+2

2-4

2*3

8/3

# If you're like me, you don't remember order of operations
9+12*3

# So use parentheses to be explicit about what you want!
(9+12)*3

# 3 squared
3^2

# 9 to the 1/2 power
9^.5

# That is the same as what is returned by the square root function
sqrt(9)

# In R you can make your own functions.
mySqrt <- function(x) { x^.5 }

# Now we can use it
mySqrt(9)


# ---------------------------------------------------------------------
# PART 2: ASSIGNMENT AND OBJECT CREATION
# ---------------------------------------------------------------------

# Let's make a variable called Fred
Fred <- 2+2

# Congrats! You've just made your first variable.
# We say, "Fred gets two plus two". The arrow (less than, dash)
# "assigns" 2+2 to Fred. You'll use the "gets arrow" all the time!

# Type Fred on a line by itself and press return and you find out
# what is contained in Fred
Fred

# Caps matter. There is no object "fred". Your first error message!
fred

# Error messages are our friends :) They give us tough love.


# One variable can be assigned to another.
a <- Fred

# Now 'a' is a copy of Fred.
a

# We read the next line as "A gets square root of 'a' times five"
A <- sqrt(a)*5
A

# This one is a little tricky.  We can re-assign A.
A <- A*2
A

# Here, we create a vector of length 13; notice what ":" does
x2 <- 1:13
x2

# The "*" does element-wise multiplication.
x2*3

# Since there are fewer elements in "3" than in "x2", "3" is 
#   repeated: it is multiplied by each element in "x2".  
# This makes scalar multiplication easy.

# Vectors can also be created by the "c()" function.
# "c" stands for "concatenate".
myvec <- c(8,13,2,1,6)
myvec

# Now let's create a function that 
myDev <- function(x) { x - mean(x) }


# ---------------------------------------------------------------------
# PART 3: LOGICALS
# ---------------------------------------------------------------------


# Remember our old friend Fred?
# The variable Fred contains the numeric value 4
Fred

# Is Fred _really_ a numeric value?
is.numeric(Fred)

# Does Fred equal four?
Fred == 4

# Is Fred less than 3?
Fred < 3

# Is Fred between 3 and 10?
Fred > 3 & Fred < 10

# Is Fred either greater than 5 or equal to 2?
Fred > 5 | Fred == 2

# Which elements of x2 are greater than 5?
x2 > 5

# Let's check by printing out x2
x2

# How many elements of x2 are greater than 5
sum(x2 > 5)

# Logicals can be treated as 0's (FALSE) and 1's (TRUE)

# Let's write a function that counts how many elements in a vector
#   are greater than some other value.

myCountGreater <- function(x, threshold) {
    sum(x > threshold) 
}


# ---------------------------------------------------------------------
# PART 4: FUNCTIONS
# ---------------------------------------------------------------------

# R is a very *functional* language. 
# Most of what you do in R is use functions.
# We've already seen a couple of functions. 
# Let's look more in depth:

# sqrt() is a "function" and the "argument" is 16.
sqrt(16)

# Functions take arguments, do something, and then return something.
# Every time you see a word followed by parentheses,
# you are seeing a function.

# help() is a function that prints the help page for a function
help(sqrt)

# This does the same thing as help(sqrt)
?sqrt

# If you don't know the name of a function, but you know what it
# does, you can search for it using help.search()
help.search("skewness")

# This is the formal way to use functions.
# We can explicitly name the argument x.
sqrt(x=16)


# Some functions require no input.
# This prints to screen every object we've created so far
ls()

# This function creates a vector of 50 pseudo random numbers 
# drawn from a normal distribution with mean = 0 and sd = 1
dat <- rnorm(50, mean=0, sd=1.0)
dat

# How many elements are in dat?
length(dat)

# There are lots of useful statistical functions pre-defined
mean(dat)
median(dat)
var(dat)
sd(dat)
range(dat)
min(dat)
max(dat)
round(dat, 3)

# Now, let's create a matrix
MAT <- matrix(1:50, nrow=10, ncol=5)  

# The matrix function is
?matrix

# The dimensions of MAT are 5 rows, 2 columns
dim(MAT)             

# We can also "bind" vectors together to make matrices.
vec1 <- 1:5
vec2 <- vec1 * 2
MAT3 <- cbind(vec1, vec2)
MAT3
# The dimensions of MAT3 are 5 rows, 2 columns

MAT4 <- rbind(vec1, vec2)
MAT4
# The dimensions of MAT4 are 2 rows, 5 columns


# ---------------------------------------------------------------------
# PROBLEM SET 1
#   Put your work directly into this script, below the q's
# ---------------------------------------------------------------------
#
# a) Create a vector of 100 normally distributed random variables (mean = 
#    0 & sd = 1). Assign it to "Y"
#
# b) Create another vector, "Z", of 100 normally distributed random
#    numbers with mean = 100 and the sd = 15. 
#    HINT: See the help function if you get stuck!
#
# c) Create another variable, "Sum.dist", that is the sum of Y and Z
#
# d) Put the vector "Sum.dist" into a matrix with 20 rows and 5 columns.
#    Do so such that the numbers are put in BY ROW. 
#    Call the matrix "My.Mat"
#
# ---------------------------------------------------------------------


# ---------------------------------------------------------------------
# PART 5: READING AND WRITING DATA ON THE DISK
# ---------------------------------------------------------------------

# First, we need to set the Project Folder (Working Directory) for R.
# You can either set the Project Folder in the GUI or in a script.
#
# The Project Folder is where R will look if you read a file.
# The Project Folder is where R will write if you write a file.
# The Project Folder is where R can "save its state" at the
#   end of a session.
# 
# We recommend you create a new folder for each of your projects.
# That way you can save everything about a project together.
# Also, that way you can "save state" of R and it only applies to
#   one project.


# To display the Project Folder
getwd()

# We created a new Project at the beginning of this session.

# To list all files in your working directory
list.files()         

# IMPORTANT POINT:
#  Do you see the file named TwinExample1.csv after you hit list.files()? 
#  If not, please try again and then raise your hand if you are 
#  still having trouble finding TwinExample1.csv


# We will use "csv" files (comma separated values) for this workshop.  
#  The csv format is the most general format for data files.s
#  You can save csv files from SPSS, SAS, or Excel.  
#  And you can even open it up in a text editor.

# Now we will read our first data file.
TwinExample1 <- read.csv(file="TwinExample1.csv", header=TRUE)

# We have created a "data.frame" called TwinExample1.
is.data.frame(TwinExample1)

# Summary is one way of finding out about any object
summary(TwinExample1)

# R has hundreds of contributed libraries for all sorts of statistics
# A library called "psych" has functions useful in behavioral genetics.
library(psych)

# Now let's get some better descriptive statistics about TwinExample1
describe(TwinExample1)


# ---------------------------------------------------------------------
# Part 6: SUBSETTING & INDEXING
# ---------------------------------------------------------------------

# We can select a variable from TwinExample1 by writing:
id <- TwinExample1$ID

# This selects the third element of zygo by using an index.
id[3]

# We could also select the third row and first column of TwinExample1
#   and obtain the same cell of the dataframe.
TwinExample1[3,1]

# If we omit the number after the comma, we obtain all columns.
TwinExample1[3, ]

# These data have one twin pair per line.
# But both MZ and DZ twins are in the same file.

# Let's create a "selection vector" to select just the MZ twins
#   and just the DZ twins
MZsel <- TwinExample1$Zygosity == "MZ"
DZsel <- TwinExample1$Zygosity == "DZ"

# MZsel is TRUE only for those rows of the dataframe that have MZ twins
MZsel 

# We can now obtain descriptive statistics for just the MZ twins
describe(TwinExample1[MZsel, ])

# And for just the DZ twins
describe(TwinExample1[DZsel, ])


# ---------------------------------------------------------------------
# PART 7: CORRELATIONS, LINEAR REGRESSION, AND SELECTION
# ---------------------------------------------------------------------

# Let's print the variance-covariance matrix bw the 3rd and 5th columns.
(varcov <- var(TwinExample1[,c(3,5)]))

# Now let's save a variance-covariance matrix for the MZ twins only.
(varcovMZs <- var(TwinExample1[MZsel,c(3,5)]))

# And a variance-covariance matrix for the DZ twins only.
(varcovDZs <- var(TwinExample1[DZsel,c(3,5)]))

# We can save the phenotypic variance of MZs and DZs like so:
(PhenVar <- mean(diag(varcov)))

# And can save the covariance of MZs & DZs like this:
(covMZs <- varcovMZs[1,2])
(covDZs <- varcovDZs[1,2])


# We can plot these data for the MZ twins and DZ twins

# The lattice library has many useful plotting functions
library(lattice)

# A Scatter Plot Matrix can be created by using the splom command
splom( ~ TwinExample1[,c(3,5)] | Zygosity, data=TwinExample1)

# The variables to plot go after the ~ (tilde) and the selection
#   factor goes after the | (vertical bar)

# Finally, we can obtain basic estimates of A, C, and E 
(A <- 2*(covMZs-covDZs))
(C <- (2*covDZs)-covMZs)
(E <- PhenVar - covMZs)


# ---------------------------------------------------------------------
# PROBLEM SET 2
#   Put your work directly into this script, below the q's
# ---------------------------------------------------------------------
#
# a) take a look at the scatterplots of weight separated by Zygosity
#    in the dataset TwinExample1
#
# b) Find the phenotypic variance of the weight variable in TwinExample1
#
# c) Find the covariance of weight for MZ twins in TwinExample1
#
# d) Find the covariance of weight DZ twins in TwinExample1
#
# e) Find estimates of A, C, and E for weight
#
# BONUS QUESTION - Only do this question if you've completed the above:
# create a new variable that is the average of the
# heights of the two twins. Then create a new dataset that only includes
# twins whose average height is over the median average height. Then
# refigure A, C, and E of weight only on this subset. Do estimates change?
#
#
# ---------------------------------------------------------------------