# --------------------------------------------------------------------- # Program: TW-IntroToR-20100210.R # Author: Matt Keller, Steve Boker, Michael Spiegel # Date: Mon Mar 5 09:12:21 EST 2012 # # WELCOME TO THE WONDERFUL WORLD OF R! # # This is an introductory script for learning the very basics # of R. This header is a comment. Any text after a hash mark (#) # is ignored by R until the next carriage return. So, if you # put a hash mark at the beginning of a line, the whole line # is ignored. # # This script is formatted for a text window that is 72 columns wide. # # As we go through this file, we encourage you to write your # own comments so that you can come back to this file and # remind yourself of what was happening. # # We encourage you to make these notes in all of the scripts # in the workshop. It will thus be easier to come back later # and modify them for your own use. # # --------------------------------------------------------------------- # Revision History # Matt Keller & Steve Boker -- Wed Feb 10 12:02:11 EST 2010 # Created TW-IntroToR-20100210.R. # Keller, Boker, & Spiegel -- Mon Mar 5 09:13:12 EST 2012 # Revision to more closely support the Twin Workshop curriculum. # # Note: I always put a revision history in all of my scripts! # --------------------------------------------------------------------- # --------------------------------------------------------------------- # Variables (I always have a section here describing my variables) # --------------------------------------------------------------------- # # Fred -- Fred is a variable that has the numeric value 4 # # TwinExample1 -- simulated data frame # TwinExample1$ID -- ID number of twin pairs # TwinExample1$Zygosity -- "MZ" or "DZ" # TwinExample1$Height1 -- Height of twin 1 # TwinExample1$Weight1 -- Weight of twin 1 # TwinExample1$Height2 -- Height of twin 2 # TwinExample1$Weight2 -- Weight of twin 2 # # --------------------------------------------------------------------- # --------------------------------------------------------------------- # PART 1: USING R AS A CALCULATOR # --------------------------------------------------------------------- # Arithmetic with two numbers 2+2 2-4 2*3 8/3 # If you're like me, you don't remember order of operations 9+12*3 # So use parentheses to be explicit about what you want! (9+12)*3 # 3 squared 3^2 # 9 to the 1/2 power 9^.5 # That is the same as what is returned by the square root function sqrt(9) # In R you can make your own functions. mySqrt <- function(x) { x^.5 } # Now we can use it mySqrt(9) # --------------------------------------------------------------------- # PART 2: ASSIGNMENT AND OBJECT CREATION # --------------------------------------------------------------------- # Let's make a variable called Fred Fred <- 2+2 # Congrats! You've just made your first variable. # We say, "Fred gets two plus two". The arrow (less than, dash) # "assigns" 2+2 to Fred. You'll use the "gets arrow" all the time! # Type Fred on a line by itself and press return and you find out # what is contained in Fred Fred # Caps matter. There is no object "fred". Your first error message! fred # Error messages are our friends :) They give us tough love. # One variable can be assigned to another. a <- Fred # Now 'a' is a copy of Fred. a # We read the next line as "A gets square root of 'a' times five" A <- sqrt(a)*5 A # This one is a little tricky. We can re-assign A. A <- A*2 A # Here, we create a vector of length 13; notice what ":" does x2 <- 1:13 x2 # The "*" does element-wise multiplication. x2*3 # Since there are fewer elements in "3" than in "x2", "3" is # repeated: it is multiplied by each element in "x2". # This makes scalar multiplication easy. # Vectors can also be created by the "c()" function. # "c" stands for "concatenate". myvec <- c(8,13,2,1,6) myvec # Now let's create a function that myDev <- function(x) { x - mean(x) } # --------------------------------------------------------------------- # PART 3: LOGICALS # --------------------------------------------------------------------- # Remember our old friend Fred? # The variable Fred contains the numeric value 4 Fred # Is Fred _really_ a numeric value? is.numeric(Fred) # Does Fred equal four? Fred == 4 # Is Fred less than 3? Fred < 3 # Is Fred between 3 and 10? Fred > 3 & Fred < 10 # Is Fred either greater than 5 or equal to 2? Fred > 5 | Fred == 2 # Which elements of x2 are greater than 5? x2 > 5 # Let's check by printing out x2 x2 # How many elements of x2 are greater than 5 sum(x2 > 5) # Logicals can be treated as 0's (FALSE) and 1's (TRUE) # Let's write a function that counts how many elements in a vector # are greater than some other value. myCountGreater <- function(x, threshold) { sum(x > threshold) } # --------------------------------------------------------------------- # PART 4: FUNCTIONS # --------------------------------------------------------------------- # R is a very *functional* language. # Most of what you do in R is use functions. # We've already seen a couple of functions. # Let's look more in depth: # sqrt() is a "function" and the "argument" is 16. sqrt(16) # Functions take arguments, do something, and then return something. # Every time you see a word followed by parentheses, # you are seeing a function. # help() is a function that prints the help page for a function help(sqrt) # This does the same thing as help(sqrt) ?sqrt # If you don't know the name of a function, but you know what it # does, you can search for it using help.search() help.search("skewness") # This is the formal way to use functions. # We can explicitly name the argument x. sqrt(x=16) # Some functions require no input. # This prints to screen every object we've created so far ls() # This function creates a vector of 50 pseudo random numbers # drawn from a normal distribution with mean = 0 and sd = 1 dat <- rnorm(50, mean=0, sd=1.0) dat # How many elements are in dat? length(dat) # There are lots of useful statistical functions pre-defined mean(dat) median(dat) var(dat) sd(dat) range(dat) min(dat) max(dat) round(dat, 3) # Now, let's create a matrix MAT <- matrix(1:50, nrow=10, ncol=5) # The matrix function is ?matrix # The dimensions of MAT are 5 rows, 2 columns dim(MAT) # We can also "bind" vectors together to make matrices. vec1 <- 1:5 vec2 <- vec1 * 2 MAT3 <- cbind(vec1, vec2) MAT3 # The dimensions of MAT3 are 5 rows, 2 columns MAT4 <- rbind(vec1, vec2) MAT4 # The dimensions of MAT4 are 2 rows, 5 columns # --------------------------------------------------------------------- # PROBLEM SET 1 # Put your work directly into this script, below the q's # --------------------------------------------------------------------- # # a) Create a vector of 100 normally distributed random variables (mean = # 0 & sd = 1). Assign it to "Y" # # b) Create another vector, "Z", of 100 normally distributed random # numbers with mean = 100 and the sd = 15. # HINT: See the help function if you get stuck! # # c) Create another variable, "Sum.dist", that is the sum of Y and Z # # d) Put the vector "Sum.dist" into a matrix with 20 rows and 5 columns. # Do so such that the numbers are put in BY ROW. # Call the matrix "My.Mat" # # --------------------------------------------------------------------- # --------------------------------------------------------------------- # PART 5: READING AND WRITING DATA ON THE DISK # --------------------------------------------------------------------- # First, we need to set the Project Folder (Working Directory) for R. # You can either set the Project Folder in the GUI or in a script. # # The Project Folder is where R will look if you read a file. # The Project Folder is where R will write if you write a file. # The Project Folder is where R can "save its state" at the # end of a session. # # We recommend you create a new folder for each of your projects. # That way you can save everything about a project together. # Also, that way you can "save state" of R and it only applies to # one project. # To display the Project Folder getwd() # We created a new Project at the beginning of this session. # To list all files in your working directory list.files() # IMPORTANT POINT: # Do you see the file named TwinExample1.csv after you hit list.files()? # If not, please try again and then raise your hand if you are # still having trouble finding TwinExample1.csv # We will use "csv" files (comma separated values) for this workshop. # The csv format is the most general format for data files.s # You can save csv files from SPSS, SAS, or Excel. # And you can even open it up in a text editor. # Now we will read our first data file. TwinExample1 <- read.csv(file="TwinExample1.csv", header=TRUE) # We have created a "data.frame" called TwinExample1. is.data.frame(TwinExample1) # Summary is one way of finding out about any object summary(TwinExample1) # R has hundreds of contributed libraries for all sorts of statistics # A library called "psych" has functions useful in behavioral genetics. library(psych) # Now let's get some better descriptive statistics about TwinExample1 describe(TwinExample1) # --------------------------------------------------------------------- # Part 6: SUBSETTING & INDEXING # --------------------------------------------------------------------- # We can select a variable from TwinExample1 by writing: id <- TwinExample1$ID # This selects the third element of zygo by using an index. id[3] # We could also select the third row and first column of TwinExample1 # and obtain the same cell of the dataframe. TwinExample1[3,1] # If we omit the number after the comma, we obtain all columns. TwinExample1[3, ] # These data have one twin pair per line. # But both MZ and DZ twins are in the same file. # Let's create a "selection vector" to select just the MZ twins # and just the DZ twins MZsel <- TwinExample1$Zygosity == "MZ" DZsel <- TwinExample1$Zygosity == "DZ" # MZsel is TRUE only for those rows of the dataframe that have MZ twins MZsel # We can now obtain descriptive statistics for just the MZ twins describe(TwinExample1[MZsel, ]) # And for just the DZ twins describe(TwinExample1[DZsel, ]) # --------------------------------------------------------------------- # PART 7: CORRELATIONS, LINEAR REGRESSION, AND SELECTION # --------------------------------------------------------------------- # Let's print the variance-covariance matrix bw the 3rd and 5th columns. (varcov <- var(TwinExample1[,c(3,5)])) # Now let's save a variance-covariance matrix for the MZ twins only. (varcovMZs <- var(TwinExample1[MZsel,c(3,5)])) # And a variance-covariance matrix for the DZ twins only. (varcovDZs <- var(TwinExample1[DZsel,c(3,5)])) # We can save the phenotypic variance of MZs and DZs like so: (PhenVar <- mean(diag(varcov))) # And can save the covariance of MZs & DZs like this: (covMZs <- varcovMZs[1,2]) (covDZs <- varcovDZs[1,2]) # We can plot these data for the MZ twins and DZ twins # The lattice library has many useful plotting functions library(lattice) # A Scatter Plot Matrix can be created by using the splom command splom( ~ TwinExample1[,c(3,5)] | Zygosity, data=TwinExample1) # The variables to plot go after the ~ (tilde) and the selection # factor goes after the | (vertical bar) # Finally, we can obtain basic estimates of A, C, and E (A <- 2*(covMZs-covDZs)) (C <- (2*covDZs)-covMZs) (E <- PhenVar - covMZs) # --------------------------------------------------------------------- # PROBLEM SET 2 # Put your work directly into this script, below the q's # --------------------------------------------------------------------- # # a) take a look at the scatterplots of weight separated by Zygosity # in the dataset TwinExample1 # # b) Find the phenotypic variance of the weight variable in TwinExample1 # # c) Find the covariance of weight for MZ twins in TwinExample1 # # d) Find the covariance of weight DZ twins in TwinExample1 # # e) Find estimates of A, C, and E for weight # # BONUS QUESTION - Only do this question if you've completed the above: # create a new variable that is the average of the # heights of the two twins. Then create a new dataset that only includes # twins whose average height is over the median average height. Then # refigure A, C, and E of weight only on this subset. Do estimates change? # # # ---------------------------------------------------------------------