##################################################################### # Program: IntroToR2018.R # Author: Elizabeth Prom-Wormley # Date: March 2, 2018 # # Questions??? # Just ask in person or virtually by e-mail # Elizabeth.Prom-Wormley@vcuhealth.org # # # This script will help you learn enough R basics to get # the most of this week. # # # This header is a comment. Any text after a hash mark (#) # is ignored by R until the next carriage return. So, if you # put a hash mark at the beginning of a line, the whole line # is ignored. # # This script is formatted for a text window that is 72 columns wide. # # As we go through this file, you are welcome to write your # own comments so that you can come back to this file and # remind yourself of what was happening. These can also serve as # class notes. # # We encourage you to make these notes in all of the scripts # in the workshop. It will be easier to come back later # and modify them for your own use. # # Learning goals for this script/lecture are as follows: # # 1- Find/open files # 2- Learn how R works and thinks about files/data # 3- How to use R to get a feel for the data # 4- Basic data management # 5- Summary Statistics # 6- Summarizing results for use outside of R # 7- Introduction to functions #################################################################### #################################################################### # 1- OPENING A FILE #################################################################### # First, we have to know where to find the file getwd() # PC Users setwd('/home/elizabeth/2018') #Mac users setwd('/Users/ecpromwormle/Documents/MyDocuments/OZclass2016') getwd() # Now, we can open the file YTD <- read.table(file = "twinData.csv", header = TRUE, sep = ",") # Translation # We just made an object called "YTD" that came from reading a file # called "YoungTwinData.csv" that has variables names (header=TRUE) # and is comma delimited (sep=",") #################################################################### # 2- LEARNING HOW R THINKS/WORKS WITH DATA #################################################################### # First, know your data structure # Why should you care? class(YTD) # There are four typical classes of data structures: data frames, # matrices, vectors, and lists. Your need for a specific data # structure typically depends on the function you are using. # We will focus most of our time this week using data frames. # We will discuss a bit over the next couple of lines # For more information on data types visit # http://adv-r.had.co.nz/Data-structures.html # DATA FRAME- Most general data structure because you can have many # different types of data (ie: one variable is character and another # is numeric) head(YTD) is.data.frame(YTD) # LIST- A collection arbitrary objects (can be of different # types) under a single name data.frames have the structure of # a type of list. Requirement that all list elements are # of the same length. alist <- list(one= 1:100, two= c("a","b",LETTERS[18]), three= pi) class(alist) is.list(alist) # MATRIX- All elements of a matrix have the same mode # (numeric or character). They allow for efficient mathematical # operations and manipulation not available for data frames. head(YTDM2) is.matrix(YTDM2) # VECTOR- One long row of data. Allows for efficient operations # not available for data frames. YTD[,3] is.vector(YTD[,3]) ls() # Removing an object (or two) rm(list=c("alist")) #################################################################### # 3- GETTING A FEEL FOR THE DATA #################################################################### # How do we know what's in that data frame? dim() names() head() tail() str() summary() # What if we want to look at one specific variable (age)? YTD$age1 # We could also use the index number assigned to that variable YTD[7] # This works too YTD[, 7] # What does this do? YTD[7, ] #################################################################### # 4- BASIC DATA MANAGEMENT #################################################################### # Frequencies # Coding note 1 - zyg is coded as follows: 1=MZf, 2=MZm, 3=DZf, 4=DZm, 5=DZfm, 6 = DZmf table(YTD$zyg, useNA ="always") # Can also use the CrossTable function in descr install.packages("descr") require(descr) # Translation- In the two lines above we installed a package called descr. # There are many packages in R to allow you to do many things and it would # take too much memory to download all the packages. So, you can just open one # of interest. In this case, gmodels. It has a CrossTable(x = as.factor(YTD$zyg), format = "SAS", missing.include = TRUE) # Coding note 2 - sex is coded as follows: XXXX # CLASS TASK- Figure out how sex is coded. CrossTable(x = as.factor(YTD$zyg), y = as.factor(YTD$sex1), format = "SAS", missing.include = FALSE) # Recoding variables (object elements) install.packages("car") require(car) YTD$zyg2 <- recode(YTD$zyg,"1 = 'MZ'; 2 = 'MZ' ; 3 = 'DZ'; 4 = 'DZ'; 5 = 'DZ'; 6='DZ' ") # Transforming variables YTD$age1t <- log10(YTD$age1) YTD$age2t <- log10(YTD$age2) # Creating new variables # Two Categories quantile (YTD$mem1, probs = 0.9) quantile (YTD$mem2, probs = 0.9) YTD$memBin1 <- ifelse(YTD$mem1 > 1.247992, yes = 1, no = 0) YTD$memBin2 <- ifelse(YTD$mem2 > 1.287143, yes = 1, no = 0) # Three Categories quantile (YTD$mem1, probs = 0.9) quantile (YTD$mem1, probs = 0.75) YTD$MemCat3t1[YTD$mem1 <= 0.6753861] <- 0 YTD$MemCat3t1[0.6753861< YTD$mem1 & YTD$mem1 <= 1.247992 ] <- 1 YTD$MemCat3t1[YTD$mem1 > 1.247992 ] <- 2 # Sorting by ord1 YTDsorted <- YTD[order(YTD$ord1), ] head(YTDsorted) print(YTDsorted$ord1) # Selecting Variables YTDsmall <- YTD[c(1,2,3,4,5,6,7,8,12,13)] # what if you wanted all variables except for zygosity? YTDno2 <- YTD[-c(2)] # Selecting Observations YTD8 <- YTD[which(YTD$age1 >= 8), ] # What if we wanted to add another filter for selection? # Logical Operator Review # & and # | or # == equal to # != not equal to # <= less than or equal to # >= greater than or equal to # (x >= 0) & (x<=1) x is between zero and one # Subsetting-- what are we doing here? Vars <- 'mem' nv <- 1 ntv <- nv*2 selVars <- paste(Vars, c(rep(1,nv), rep(2,nv)), sep = "") mzData <- subset(YTD, zyg == 1, selVars) dzData <- subset(YTD, zyg == 3, selVars) #################################################################### # 5a- GETTING TO KNOW THE DATA- SUMMARY STATISTICS #################################################################### # Basic summary statistics summary() mean() var() cov() cor() # Also polycor package for polychoric, polyserial correlations # See- http://cran.r-project.org/web/packages/polycor/polycor.pdf #################################################################### # 5b- GETTING TO KNOW THE DATA- GRAPHING #################################################################### hist(YTD$mem1, col = "gray", xlab = "Measure", ylab = "Weight", main = "Distribution of Weight for Twin 1") plot(mzData$mem1, mzData$mem2, col = 'red', pch = 2, main = "MZ Correlations for MEM") plot(dzData$mem1, dzData$mem2, col = 'blue', pch = 3, main = "DZ Correlations for MEM") qqnorm() qqline() boxplot(YTD[c()]) ### Graphing the same thing over several variables ### par(mfcol=c(1,3)) for (i in 9:11){ hist(YTD[, i], main= paste(c("Histogram","of", names(YTD[i])), sep = " "), probability = TRUE, col = "lightblue") d1<-density(YTD[,i], na.rm = TRUE) lines(d1, lty = 2, lwd = 3) } # Why did I use both"YTD[,i]" and "YTD[i]" in the # prior example? # Putting it All Together # CLASS TASK- Advanced Option # 1- Calculate twin correlations for MZ and DZ pairs for mem # 2- Plot the correlations as barplots # CLASS TASK ANSWERED- Calculating Twin Correlations by Zygosity # par(mfcol = c(1,1)) rMZ <- cor(mzData$mem1, mzData$mem2, use = "pairwise.complete.obs") rDZ <- cor(dzData$mem1, dzData$mem2, use = "pairwise.complete.obs") # What is happening here? What is cbind()?# corrs <- cbind(rMZ, rDZ) rbind() # Setup overall background to make transfer to slides prettier par(bg = "white") # Making barplot of twin correlations and putting a box around figure barplot(corrs, ylim = c(0,1), col = c("cyan"), cex.axis = 2, cex.names = 2, space = 0.1) box() # Putting it all together- Reporting Results # CLASS TASK- Advanced Option # 1- Calculate means, variances, and covariances of mem # for each twin by zygosity (ie: mean of mem1 for MZ pairs) # 2- Aggregate MZ results in a single object # 3- Ouptut MZ results to a comma delimited (.csv) file for # future use in another program ## CLASS TASK ANSWERED xmem1mz <- mean(x = mzData$mem1, na.rm = TRUE) xmem2mz <- mean(x = mzData$mem2, na.rm = TRUE) xmem1dz <- mean(x = dzData$mem1, na.rm = TRUE) xmem2dz <- mean(x = dzData$mem2, na.rm = TRUE) varmem1mz <- var(x = mzData$mem1, na.rm = TRUE) varmem2mz <- var(x = mzData$mem2, na.rm = TRUE) varmem1dz <- var(x = dzData$mem1, na.rm = TRUE) varmem2dz <- var(x = dzData$mem2, na.rm = TRUE) mzcov <- cov(x = mzData$mem1, y = mzData$mem2, use = "pairwise.complete.obs") dzcov <- cov(x = dzData$mem1, y = dzData$mem2, use = "pairwise.complete.obs") #################################################################### # 5- USING RESULTS OUTSIDE OF R #################################################################### # Result summaries for general summary statistics results <- round(c(xmem1mz, varmem1mz, mzcov),2) names(results) <- c("meanmemt1MZs", "varmemt1MZs" , "memmzcov") write.table(x = results, file = "MZresults.csv", sep = ",", col.names = TRUE, row.names = FALSE) # Result summaries for frequencies # Advanced Option # 1- Make a frequency table of ord1 by sex1 and report the number of people in each # age category. Report both N and %. # # 3- Output results into a comma delimited file (.csv) for use # in another program (ie: Excel) for reporting # One approach to the frequency problem from above AgeFreq <- CrossTable(x = as.factor(YTD$agecat), format = "SAS", missing.include = TRUE) FreqOut1 <- c(AgeFreq$t[1], round(AgeFreq$prop.row[1],2)) FreqOut2 <- c(AgeFreq$t[2], round(AgeFreq$prop.row[2],2)) FreqOut3 <- c(AgeFreq$t[3], round(AgeFreq$prop.row[3],2)) Freqs <- as.data.frame(rbind(FreqOut1, FreqOut2, FreqOut3)) names(Freqs) <- c("N", "Percent") Freqs$AgeCat <- c("< 20", "20-27", "> 28") Freqs2 <- Freqs[c(3,1,2)] write.table(x = Freqs2, file = "Freqs2.csv", sep = ",", col.names = TRUE, row.names = FALSE) # Saving Graphics # One way is simply copy and paste. Another is to output to # another file (ie: PDF) pdf(file="TwinCorrelations.pdf") # Setup overall background to make transfer to slides prettier par(bg = "white") par(mfcol=c(1,2)) # Making barplot of twin correlations and putting a box around figure barplot(corrs, ylim = c(0,1), col = c("cyan"), cex.axis = 2, cex.names = 2, space = 0.1) box() dev.off() # Just for your knowledge. # Removing objects in your working directory # Be careful when you run this. # ls() # rm(list=ls()) # ls() #################################################################### # 5- BASIC FUNCTION DEVELOPMENT #################################################################### # Function refresher- A function is previously developed code that # the user invokes to do something that they may do often. # Example- Running a simple correlation cor(YTD$wt1, YTD$wt2, use = "pairwise.complete.obs", method = 'pearson') # But what does cor() do, really?? ?cor() cor # What if you wanted to write your own function to do your own thing? # Data Preparation Step YTDt1 <- YTD[c(1,2,3,6,12)] YTDt2 <- YTD[c(1,2,4,7, 13)] names(YTDt1) <- c("ID", "zyg", "sex", "age", "mem") names(YTDt2) <- c("ID", "zyg", "sex", "age", "mem") YTDt1$twinid <- 1 YTDt2$twinid <- 2 PopData <- rbind(YTDt1, YTDt2) # When analyzing twin/family data it is necessary to change data # configuration so that each twin/family exists on its own line # Steps requried (focusing on twin data) # 1- Make a twin one object (twin1) # 2- Make a twin two object (twin2) # 3- Merge twin one and twin two objects by family number and provide an # indicator on a variable name that demonstrates whether the variable # belongs to twin one or twin two # Try it yourself! datT1 <- PopData[PopData$twinid == min(PopData$twinid),] datT2 <- PopData[PopData$twinid == max(PopData$twinid),] datTP <- merge(datT1, datT2, by = c("ID", "age", "zyg"), all.x = TRUE, all.y = TRUE, suffixes = c("_T1","_T2")) datTP$twinid_T1 <- NULL datTP$twinid_T2 <- NULL # This type of work is done so frequently, it might be worth making it more # generic to be able to do it whenever you want...or make a function # # NOTE: This is a very BASIC twin merging function and assumes complete # data for all twins on the variables for which you are merging # You will need to modify this to address issues such as missing zygosity # on one twin or another # # # # Define your function # We write the line that you will use to invoke a function called "twindat" # and defining all the necessary pieces of your function to make it work. # Also indicate that you have begun building the function, using # curly bracket- { twindat <- function(dat, famid, twinid, zyg) { # Function step 1- pull out Twin 1 and Twin 2 specific objects datT1 <- dat[dat[,twinid] == min(dat[,twinid]),] datT2 <- dat[dat[,twinid] == max(dat[,twinid]),] # Function step 2- merge Twin 1 and Twin 2 objects by family id DAT <- merge(datT1, datT2, by = c(famid), all.x = TRUE, all.y = TRUE, suffixes = c("_T1","_T2")) # Function step 3- output an object that contains the newly merged data # Also indicate end of function with curly bracket- } return(DAT) } # Use your function NewTwinData <- twindat(dat = PopData, famid = "ID", twinid = "twinid", zyg = "zyg") # How could I modify this function? #################################################################### # RESOURCES #################################################################### # 1- R for SAS and SPSS Users # A great handbook if you come from either of these programs and need # some help in translating into R # The following is a link to a pdf file with a shorter version of # the hard copy book # https://science.nature.nps.gov/im/datamgmt/statistics/R/documents/R_for_SAS_SPSS_users.pdf # 2- An R Style Guide # There are a few rules that help make it easier for you/others to # follow your R code. Suggestions are provided here # http://google-styleguide.googlecode.com/svn/trunk/Rguide.xml # 3- RSeek # A Google search engine for all things R related # http://www.rseek.org/ # 4- R-Bloggers # R-Bloggers.com is a central hub (e.g: A blog aggregator) of content # collected from bloggers who write about R (in English). # You can receive post updates by Facebook or daily e-mail # http://www.r-bloggers.com/ # 5- Quick-R's section on graphics # Good starting place to get some pointers for improving your figures # http://www.statmethods.net/advgraphs/parameters.html # 6- More information on data frames # http://www.r-bloggers.com/exploratory-data-analysis-useful-r-functions-for-exploring-a-data-frame/ # 7- Details on how to develop your own package # http://r-pkgs.had.co.nz/ # 8- Details on apply(), sapply(), and lapply() # http://www.r-bloggers.com/using-apply-sapply-lapply-in-r/