# Program: IntroToR2021.R
# Author: Elizabeth Prom-Wormley
# Date: May 1, 2021
# 
# Questions??? Just ask by e-mail: Elizabeth.Prom-Wormley@vcuhealth.org
#
# This script will help you learn enough base R basics to get 
# you started. Here, we will focus on basic the R language
# More recently, tidyverse has become a popular wrap-around
# package/language that many people are also using for data management. 
# Also, ggplot2 is a popular package for graphing. Here,
# we focus on base R coding and encourage you to build on 
# your knowledge by adding tidyverse/ggplot2 afterwards.
#
# This header is a comment.  Any text after a hash mark (#)
# is ignored by R until the next carriage return.  So, if you
# put a hash mark at the beginning of a line, the whole line
# is ignored.
#
# This script is formatted for a text window that is 72 columns wide.
#
# As we go through this file, you are welcome to write your
# own comments so that you can come back to this file and
# remind yourself of what was happening. These can also serve as
# notes for yourself.
#
# We encourage you to make these notes in all of the scripts
# in the workshop.  It will be easier to come back later
# and modify them for your own use.	
#
# Learning goals for this script/lectures are as follows:
# 1- Find/open files
# 2- Explain how R works and thinks about files/data
# 3- Conduct basic data management 			
# 4- Generate basic graphics 
# 5- Estimate summary statistics
# 6- Generate basic output files
####################################################################
# 1- OPENING A FILE
####################################################################
# PC Users
# You can set your working directory by giving R the path of where you want to pull your data
# setwd('C:\\Users\\cliffordjs\\Desktop\\R Training Scripts') 

# Mac Users
setwd('/Users/ecpromwormle/Documents/MyDocuments/EpiAdmin/Rworkshop2019')

# Boulder workshop location
setwd('/home/elizabeth/2021/Opening Files in R')
getwd()
# TRANSLATION: "get working directory", this will return 
# the location where R is looking for files and saving output

# Now, we can open the files
bmx <- read.table(file = "BMX2018.csv", header = TRUE, sep = "," , na.strings = " ")
demo <- read.table(file = "demo2018.csv", header = TRUE, sep = "," , na.string = " ")

# TRANSLATION- We just made an object called "BMX" and "DEMO" that came from reading a file   
# called "BMX.csv" and "DEMO.csv" that has variables names (header=TRUE)   
# and is comma delimited (sep=",")
# Note that since we set our working directory where we saved the data files, we can call the
# files by their name rather than giving the full path. However, you can include the path
# as part of the file name if you prefer.


# Merging two objects by a common variable, in this case SEQN
fulldata<- merge(bmx, demo, by="SEQN")
?merge()

####################################################################
# 2 - GETTING A FEEL FOR THE DATA 
####################################################################

# There are four typical classes of data structures: data frames,
# matrices, vectors, and lists.  Your need for a specific data 
# structure typically depends on the function you are using.  
# We will focus most of our time using data frames.
# See https://intellipaat.com/tutorial/r-programming/data-structures-r-programming/
# for a more in-depth discussion of data structures


# How do we know what's in that data frame? Try these functions.

dim(fulldata)  
# TRANSLATION: Returns the number of rows and number of columns (in that order)

names(fulldata) 
# TRANSLATION: Returns all the variable names within your R object (data frame)

head(fulldata)  
# TRANSLATION: Returns the first 6 rows of the R object (default). 
# You can request  to see more rows. Try head(fulldata,10)

tail(fulldata)  
# TRANSLATION: Returns the last 6 rows of the R object
# You can request to see more rows. Try tail(fulldata,10)

str(fulldata)  
# TRANSLATION: Returns the STRucture of the R object 
# (type, name, first 10 entries per variable)

summary(fulldata)  
# TRANSLATION: Returns the Mean, minimum, maximum, 
# and 25th and 75th percentile of each variale


# What if we want to look at one specific variable (age)? 
# This returns a table with each possible value and the count
table(fulldata$RIDAGEYR)  

# Question: How many observations have an age of 5?


# Answer: There are 193 observations with an age = 5

# What about the mean of age?
# This returns the mean of the variable RIDAGEYR (age)
mean(fulldata$RIDAGEYR)  

# Calculate the mean again, now for BMXWT (weight).  What happens?
mean(fulldata$BMXWT)

# Uh oh!! A problem! We have not specified that there are values of
# 'NA' in the variable.  Missing (NA) cannot be used to calculate the mean, 
# but if we run summary(), we can see whether there are NAs are in the data
summary(fulldata$BMXWT)

# So we have missing data. Can we not calculate the mean? No way!
# We can solve this visiting the help for the mean() function and including
# information about how to deal with missing data. Previously, R was trying
# to calculate a mean with all the data including missing data. However,
# what is a mean of missing?  Missing. You can bypass this by telling R 
# to not include missing data in the calculation of mean. Try the line below.
mean(fulldata$BMXWT, na.rm = TRUE)


# Time to introduce a new function!
# The describe() function gives even more information for continuous variables 
# describe() comes from the psych package
# Packages are a group of functions other people have created and shared to be used
# Each comes with a pdf file that you may search for and will have every function
# in the package with a description and examples
# https://cran.r-project.org/web/packages/psych/psych.pdf
# The first time you run this line, you may be prompted to select a mirror for download
# This is because the files are hosted at different places, feel free to select any 
# mirror you want (I tend to pick the one physically closest to me)
# Also note that you only need to install the package once, it will always be on your
# work station.
# After running once you will only need to open the package to actually use it using
# the library() or require() functions

install.packages("psych", dependencies = T)
library(psych)

# Question: What does this code do and what do you learn about the data using this line below?
describe(fulldata$RIDAGEYR)


####################################################################
# 3- BASIC DATA MANAGEMENT 
####################################################################

# Adding labels to a variable
fulldata$RACE <- factor(fulldata$RIDRETH1, levels = c(1,2, 3, 4 ,5), 
                         labels = c("Mexican-Hispanic ", "Other-Hispanic",
                                    "Non-Hispanic-White", "Non-Hispanic-Black", 
                                    "Other"))

# TRANSLATION: We save the variable into a new variable, 'RACE' we tell R the original
# variable ('RIDRETH1') has 5 possible value ('levels') and then what we
# would like the values to say ('labels')

# Question: What is happening in these lines of code?
table(fulldata$RIDRETH1) 
table(fulldata$RACE) 


# Answer: We are looking at the distribution of the original variable 
# and our labeled variable

# Let's try with another variable
fulldata$GENDER <- factor(fulldata$RIAGENDR, levels = c(1,2), labels = c("Male", "Female"))

table(fulldata$RIAGENDR)
table(fulldata$GENDER)

# Transforming variables
# Below, we transform the variables 'BMXWT' and 'BMXHT' by taking the log 
# and save it as a new variable 'BMXWTt' and 'BMXHTt
fulldata$BMXWTt <- log10(fulldata$BMXWT) 
fulldata$BMXHTt <- log10(fulldata$BMXHT) 


# Recoding a variable with categories
# DMDMARTL has 8 different values, and you only want to know if the 
# person is single or not single, 
# so you can combine the categories
fulldata$married <- ifelse(fulldata$DMDMARTL == 1, 1, 0) 

# TRANSLATION: The ifelse() function is a logical operator, 
# if the logicial statement is true, then do X, else do Y.
# So here, if "DMDMARTL" is equal to 1, then save a value of 1, 
# otherwise give it a value of 0 and save it into the variable "married"

# Confirm what was coded by looking at the distribution of the new variable
# and comparing against the old variable
table(fulldata$married)

# More variable recoding.  This time we'll use a different function to do this.
# FIRST- We'll install a new package called 'car' 
# SECOND- We'll use a function within car called recode()
# https://cran.r-project.org/web/packages/car/car.pdf

install.packages("car", dependencies = T)
require(car)

# What if we know we have missing (NA) values in our data? They will erronously be 
# coded as 0 in the above code. We can fix by recoding the variable using the 
# "recode" function from the car package
fulldata$married <- recode(fulldata$DMDMARTL, "1=1; 2:6=0; 77:100 = 'NA' ")

# TRANSLATION: if DMDMARTL = 1, then code as 1, 
# if DMDMARTL is equal 2,3,4,5, or 6, then give a value of 0
# If DMDMARTL is between 77 and 100, then code it as "NA"
# NOTE!! As a learning opportunity, we have coded "NA" as a character. 
# 'NA' It IS NOT the same as the way that R understands missing (NA) 
# Note that there are no quotes around NA when it is missing. 
table(fulldata$married)

# Combine the levels of income into four categories using recode from package 'car':
# 1 = 0-24,999
# 2 = 25,000 - 54,999
# 3 = 55,000 - 74,999
# 4 > 75,000
fulldata$faminc <- recode(fulldata$INDFMIN2, "1:5 =1; 13=1; 6:7=2; 8:10=3; 
                          14:15=4; 12 = NA; 77:100 = NA")
table(fulldata$faminc)


# Creating new variables 
#You can write algebraic equations in R

#Creating a BMI variable
#BMI = kilograms/meters**2

# We replace "kilograms" and "meters" with our variables for height and weight
fulldata$bmi2<-(fulldata$BMXWT/(fulldata$BMXHT*fulldata$BMXHT))*10000


summary(fulldata$bmi2)

## Now we can round the number to be more interpretable, here to 1 decimal place
fulldata$bmi3 <- round(fulldata$bmi2,1)
summary(fulldata$bmi3)

# Create new variable for obese (1)/not obese (0)
fulldata$obese<- recode(fulldata$bmi3, "8:24.9=0; 25:85 = 1")
table(fulldata$obese)

#Creating a Poverty Level variable
# 1 = above poverty level
# 0 = below poverty level

fulldata$abovepov <-ifelse(fulldata$faminc==1, 0, 1)
table(fulldata$abovepov)


# Selecting Variables
# we select variables based on their order in the data frame
# Here we select columns 1 to 8, 12, 13, and 66 to 68
# Useful for selecting multiple variables for a smaller data set
# Up until this point, we have been referring to elements in an object 
# (aka: variables in a dataset) by convention of using an 
# object name and an element name separated by a dollar sign 
# (ex: fulldata$faminc refers to element "faminc" in the object "fulldata")

# However, this approach can take a lot of time effort if you need to work
# through many variables quickly. For example, if you want to work with a 
# dataset consisting of a smaller subset of variables.
# You can streamline your work by using 
# the number of the column where that variable resides. 
# You can determine the column numbers associated with an element by using names(). 
names(fulldata)

# Then you can identify the exact elements by number rather than using their names
fulldatasmall <- fulldata[c(1:8,12,13,66,67, 68)]


# what if you wanted all variables except for age?
# we add in a minus sign to indicate "NOT" this variale
fulldatano2 <- fulldata[-c(28)]


# Selecting Observations
# Here we will make an object that contains all the data from
# individuals who are 28 or older using the which function
# we select rows by adding in the comma after the logicial statement as R
# by default reads matricies as rows by columns (r x c)
fulldata8 <- fulldata[which(fulldata$RIDAGEYR >= 28), ]

# TRANSLATION: We ask R to produce a new object called fulldata8 that comes
# from the object fulldata. From fulldata, we will take all the observations
# with an age greater than or equal to 28 ("which(fulldata$RIDAGEYR >=28)"). 
# We will also include all the variables for these observations. 
# R understands this when it reads no numbers after the comma. See below for an
# example where we only take 3 variables.

# BONUS- We can also select individuals who are 28 and older for 3 specific variables
fulldata8a <- fulldata[which(fulldata$RIDAGEYR >= 28), c(1,2,3)]

# BONUS- We can also use the subset function to do the same thing. 
fulldata9 <- subset(fulldata,fulldata$RIDAGEYR >= 28, c(1,2,3))

# Logical Operator Review
# & and
# | or
# == equal to
# != not equal to
# <= less than or equal to
# >= greater than or equal to
# (x >= 0) & (x<=1)  x is between zero and one
# is.na(X)
# !is.na(X)

####################################################################
# 4- GETTING TO KNOW THE DATA- GRAPHING
####################################################################

hist(fulldata$bmi2, col = "gray", xlab = "BMI", ylab = "Count", 
     main = "Distribution of BMI")

boxplot(bmi2~faminc, data=fulldata, main="Income Vs Obesity Status",
        xlab="Income Level", ylab="Height", col = "gray") 


# Saving Graphics
# One way is simply copy and paste.  Another is to output to 
# another file (ie: PDF)

pdf(file="test_pics.pdf") # tell R we want to export pictures to pdf file
# While this will make a file, there will be nothing in it. We are getting the 
# file ready for the following code which will be written into that file

hist(fulldata$bmi2, col = "gray", xlab = "BMI", ylab = "Count", 
     main = "Distribution of BMI")

boxplot(bmi2~faminc,data=fulldata, main="Income Vs Obesity Status",
        xlab="Income Level", ylab="Height", col = "gray") 

dev.off() 
# turns off the function that writes graphs to pdf (ie. the pdf(file=....))

# Change graphing viewing paramter setup 
# Here, just changing the color of the overall background 
par(bg = "gray")

hist(fulldata$BMXARMC, col = "red", xlab = "Arm Circumference (cm)", ylab = "Count", 
     main = "Distribution of Arm Circumference for Participants")

# Setup output for multiple graphs per screen
# Put the background back to white first
par(bg = "white")


# This tells R that we want 1 row and two columns of graphs
par(mfcol=c(1,2))


#  The first graph
hist(fulldata$bmi2, col = "gray", xlab = "BMI", ylab = "Count", 
     main = "Distribution of BMI")


# Now add in the second graph

boxplot(bmi2~faminc,data=fulldata, main="Income Vs Obesity Status",
        xlab="Income Level", ylab="Height", col = "gray") 


####################################################################
# 5- GETTING TO KNOW THE DATA- SUMMARY STATISTICS
####################################################################

# Basic summary statistics
summary(fulldata$RIDAGEYR)
mean(fulldata$RIDAGEYR)
var(fulldata$RIDAGEYR)   # Gives us the variance of a variable
cov(fulldata$RIDAGEYR, fulldata$BMXARMC, use="complete.obs")  # Gives the covariance between
# two variables while removing missing individuals (use = "complete.obs")

cor(fulldata$RIDAGEYR, fulldata$BMXARMC, use="complete.obs")
# Gives us the correlation between two variables while removing individual with missing


# Frequencies 
table(fulldata$RIAGENDR, useNA ="always")

#If you want to find specific frequencies (like the number of each sex in each marital category), you can cross the frequencies
table(fulldata$RIAGENDR, fulldata$DMDMARTL)


# Can also use the CrossTable function in package, "descr"
# https://cran.r-project.org/web/packages/descr/descr.pdf
install.packages('descr', dependencies = T)
require(descr)

## 

CrossTable(x = fulldata$RIAGENDR,y = fulldata$married, format = "SAS", 
           missing.include = FALSE, chisq=T)

# Here is describe again!
describe(fulldata$RIDAGEYR)

## Can also pull out the quartiles
quantile (fulldata$RIDAGEYR, probs = 0.25, na.rm=TRUE) #na.rm is needed when there are missing values
quantile (fulldata$RIDAGEYR, probs = 0.5, ra.rm=TRUE)

### OPTIONAL QUESTION: How would you get the value for the third quartile? What is the third quartile value?
### OPTIONAL QUESTION: What is the median value for age?


# Let's see if there are significant difference in obesity by race, income, and gender
# For categorical variables: chisquare - to assess whether your categories differ from one another
chisq.test(fulldata$INDFMIN2,fulldata$obese)
chisq.test(fulldata$RIDRETH1, fulldata$obese)
chisq.test(fulldata$GENDER, fulldata$obese)


### OPTIONAL QUESTION: Among all race/ethnicity categories, which group has the highest proportion of obesity?
### OPTIONAL QUESTION: Is BMI category significantly different by race/ethnicity? How can you tell?
### OPTIONAL QUESTION: Which gender group has a higher proportion of obesity?
### OPTIONAL QUESTION: Among those who are obese, are there more men or women?
### OPTIONAL QUESTION: Is BMI category significantly different by gender? How can you tell?


### T tests. Look for a mean difference between groups on a continuous variable
t.test(BMXHT~obese, data = fulldata)

# SO..Is there a difference in the height between obese and non-obese individuals?


### Correlation
cor(fulldata$RIDAGEYR, fulldata$BMXWT, use="complete.obs")

# we get the correlation between age and weight, but no p value!
# Solution, use the cor.test function which will give us a p value

cor.test(fulldata$RIDAGEYR, fulldata$BMXWT, use="complete.obs")

# Same thing, but using height rather than weight

cor(fulldata$RIDAGEYR, fulldata$BMXHT, use="complete.obs")
cor.test(fulldata$RIDAGEYR, fulldata$BMXHT, use="complete.obs")


####################################################################
# 5- Linear Models and Logistic Regression
####################################################################
# Logistic regression should be used for outcomes that are dichotomous 
# (in our case, overweight or not overweight)
# This will produce odds ratios for interpretation for binary data (logistic regression)

logfit <- glm(obese~RIDAGEYR, data=fulldata, family=binomial(link=logit))
summary(logfit)

### Is this association significant? How do you know?


### Let's also examine the odds ratios 
# This line will give you the OR and 95% Confidence interval from a logistic regression
# by using the fitted R object
# we will exponentiate (exp) the coefficients (coef) and confidence intervals (confint)
# produced by the general linear model (glm)

exp(cbind(OR = coef(logfit), confint(logfit)))


### What if I want to change the reference level? By default, lowest number is reference

fulldata$obese2<-as.factor(recode(fulldata$obese, "0=1; 1=2"))
fulldata$obese3<-relevel(fulldata$obese2, ref = 2) # note that this is only done with factors

logfit2<-glm(obese3~RIDAGEYR, data=fulldata, family=binomial(link=logit))
summary(logfit2)


exp(cbind(OR = coef(logfit2), confint(logfit2)))


# Model of obesity by race;

logfit3<-glm(obese~RIDRETH1, data=fulldata, family=binomial(link=logit))
summary(logfit3)
exp(cbind(OR = coef(logfit3), confint(logfit3)))


## Is this association significant? How do you know?


# Linear Model of obesity by income;
logfit4<-glm(bmi2~relevel(as.factor(faminc), ref=4) + 
               as.factor(GENDER) + as.factor(married),  data=fulldata) 
summary(logfit4)


####Is this association significant? How do you know?


####################################################################
# 6 - USING RESULTS OUTSIDE OF R
####################################################################
# R files may be exported to numerous file types

# Exporting R dataset, note how I attach the file format I want in the new file name 
# ie. you must add the suffix ".txt" if you want a text file written
write.table(fulldata, file = "trainingdata.csv", col.names = TRUE, row.names = FALSE, sep = ",")

# Where can you find this file if you're looking through your files manually?
# In the working directory, ie. getwd()! 


# What about a CSV file?
write.csv(fulldata, file="trainingdata.csv")
# Again, notice you must put the suffix on the the file name

# Can I write individual results?
# Let's just write the beta coefficients from our model of obesity by gender
write.csv(coef(summary(logfit4)), file = "trainingoutput.csv")


####################################################################
# RESOURCES  
####################################################################


# 1- R for SAS and SPSS Users 
# A great handbook if you come from either of these programs and need 
# some help in translating into R
# The following is a link to a pdf file with a shorter version of 
# the hard copy book
# https://science.nature.nps.gov/im/datamgmt/statistics/R/documents/R_for_SAS_SPSS_users.pdf

# 2- An R Style Guide
# There are a few rules that help make it easier for you/others to  
# follow your R code.  Suggestions are provided here
# https://jef.works/R-style-guide/
# http://adv-r.had.co.nz/Style.html

# 3- RSeek
# A Google search engine for all things R related
# http://www.rseek.org/

# 4- R-Bloggers
# R-Bloggers.com is a central hub (e.g: A blog aggregator) of content    
# collected from bloggers who write about R (in English).
# You can receive post updates by Facebook or daily e-mail
# http://www.r-bloggers.com/

# 5- Quick-R's section on graphics
# Good starting place  to get some pointers for improving your figures
# http://www.statmethods.net/advgraphs/parameters.html

# 6- More information on data frames
# http://www.r-bloggers.com/exploratory-data-analysis-useful-r-functions-for-exploring-a-data-frame/

# 7- Details on how to develop your own package
# http://r-pkgs.had.co.nz/

# 8- Details on apply(), sapply(), and lapply()
# http://www.r-bloggers.com/using-apply-sapply-lapply-in-r/??