# LECTURE/HW #5 - REVIEW OF MATERIAL FROM FIRST FOUR LECTURES
# In this lecture/HW, I won't introduce anything new. Use your knowledge (and
# scripts) from the first four lectures to help you answer all the problems
# below. Good luck!

# 1 R'S WORKING DIRECTORY ---------------------

# Read in a script I wrote that contains handy functions like info(x), LS(),
# and look(x). The function source() simply reads in another R script and runs
# every line of code (this one is fetched over the web, so it needs a network
# connection).
source("http://www.matthewckeller.com/R.Class/KellerScript2.R")

# a) Create a new folder somewhere on your computer and make this folder your
# working directory for this session.
# NOTE(review): this path is specific to the instructor's machine — replace it
# with a folder that exists on YOUR computer before running.
setwd("~/Google Drive/DriveDocuments/Teaching/R/2020/Lectures/Lecture5-REVIEW")

# 2 VECTORS ---------------------

# a) Create a vector, length=500, of random, uniformly distributed numbers
# between -1 and 1. Assign this to an object called "rand.unif"
rand.unif <- runif(500,-1,1)

# b) Create another vector of length=2000 of random, normally distributed
# numbers (mean=100, sd=15). Assign this to an object called "rand.norm"
rand.norm <- rnorm(2000,100,15)

# c) Create a vector of length 2500 whose first 500 elements are from
# rand.unif and last 2000 are from rand.norm. Assign this to an object called
# "long.vector"
long.vector <- c(rand.unif,rand.norm)

# d) What are the dimensions of "long.vector"? (Hint: dim() doesn't give you
# what you want for vectors — plain vectors have a length, not dimensions, so
# dim() returns NULL.)
length(long.vector)

# e) Check that you've created the vector properly; run summary statistics on
# the first 500 (should range roughly -1 to 1), then do the same for the last
# 2000 (should center near 100).
summary(long.vector[1:500])
summary(long.vector[501:2500])

# 3 MATRICES & WRITING FILES ---------------------

# a) Use the object "long.vector" to create a matrix of 500 rows and 5
# columns. The first column is rand.unif; the next four are rand.norm.
# matrix() fills column-by-column by default, so the first 500 values
# (rand.unif) land in column 1. Assign this object to be called "sim.matrix"
sim.matrix <- matrix(long.vector,ncol=5)

# b) Use the object "long.vector" to create another matrix of 500 rows and 5
# columns. The first 100 rows should be rand.unif.
# (3b continued) The next 400 rows are rand.norm. Assign this object to be
# called "sim.matrix.2". With byrow=TRUE the matrix fills row-by-row, so the
# first 500 values (rand.unif) occupy rows 1-100 (100 rows x 5 columns) and
# the remaining 2000 values (rand.norm) occupy rows 101-500.
sim.matrix.2 <- matrix(long.vector,ncol=5,byrow=TRUE)

# c) Write the matrix "sim.matrix" out to your working directory as a
# comma-delimited file. Do NOT include column or row names. Call this file
# "My.Simulated.R.Matrix.txt"
write.table(sim.matrix,"My.Simulated.R.Matrix.txt",sep=",",col.names=FALSE,row.names=FALSE)

# d) Look at "My.Simulated.R.Matrix.txt" using your Terminal (e.g., the second
# tab next to Console). Does it look as expected?

# e) Let's say we want to round all the elements to 3 decimal places. Write
# out "sim.matrix" again as a comma-delimited file, calling it "My.Sim2.txt",
# but with all elements rounded to 3 decimal places.
write.table(round(sim.matrix,3),"My.Sim2.txt",sep=",",col.names=FALSE,row.names=FALSE)

# f) Look at "My.Sim2.txt" using your Terminal (e.g., the second tab next to
# Console here in RStudio). Does it look as expected?

# 4 WORKING DIRECTORY, LISTING FILES, & READING FILES ---------------------

# a) List the files currently in your working directory
list.files()

# b) Create a character vector called "myfiles" that contains the names of
# all files in your working directory
myfiles <- list.files()

# c) Using the == operator, create a logical (TRUE/FALSE) vector that is TRUE
# when myfiles == "My.Simulated.R.Matrix.txt". Assign this to an object
# called "file.index"
file.index <- myfiles=="My.Simulated.R.Matrix.txt"

# d) Using "file.index" as an index, extract the element of "myfiles" that is
# equal to "My.Simulated.R.Matrix.txt". Assign this to an object called
# "file.name"
file.name <- myfiles[file.index]

# e) Here is an idea that we'll use more as the course progresses. Use the
# object "file.name" within the read.table function to read the
# "My.Simulated.R.Matrix.txt" file into R. Assign this to a new object
# called "Read.Mat" (read.table returns a data.frame, not a matrix).
Read.Mat <- read.table(file.name,sep=",")

# f) What is the class of Read.Mat? What are its dimensions?
# info() comes from the sourced KellerScript2.R — it reports class/dimensions
info(Read.Mat)

# g) Use the head function to look at the first 10 rows of Read.Mat
head(Read.Mat,n=10)

# 5 RECODING ---------------------

# a) Use the "colnames" function to rename the columns of Read.Mat.
# 1st column: "Level", 2nd: "Y", 3rd: "Sex", 4th: "X1", 5th: "X2"
colnames(Read.Mat) <- c("Level","Y","Sex","X1","X2")

# b) Recode "Level" to be an ordered factor: -1 if < -.333, 0 if between, and
# 1 if > .333. Hint: recode to an ordered factor *after* changing the values.
# The two logical comparisons each yield 0/1, so their weighted sum gives
# -1 / 0 / 1 directly (values in [-.333, .333] get 0 from both terms).
Read.Mat$Level <- (Read.Mat$Level< -.333)* -1 + (Read.Mat$Level> .333)* 1
Read.Mat$Level <- as.ordered(Read.Mat$Level)

# c) Create a new variable in Read.Mat, "Female", that is 0 if below the mean
# of Sex and 1 if above the mean of Sex ("1" is for females, "0" for males).
# The first line pre-creates the column; the second overwrites it with the
# 0/1 factor (multiplying the logical by 1 converts TRUE/FALSE to 1/0).
Read.Mat$Female <- NA
Read.Mat$Female <- factor((Read.Mat$Sex>mean(Read.Mat$Sex))*1)

# 6 BASIC DESCRIPTIVES & SUBSETTING ---------------------

# a) Find the mean Y score for females and the mean Y score for males.
# Two ways: logical row-subsetting per group, or tapply() over the factor.
# (Comparing the factor to 0/1 works because R compares against the level
# labels "0"/"1".)
mean(Read.Mat[Read.Mat$Female==0,"Y"])
mean(Read.Mat[Read.Mat$Female==1,"Y"])
tapply(Read.Mat$Y,Read.Mat$Female,mean)

# b) Using tapply(), find the standard deviations of Y for the three levels
# of the "Level" factor. Place this in a 3-element vector entitled "SD.Levels"
SD.Levels <- tapply(Read.Mat$Y,Read.Mat$Level,sd)

# c) Using the apply() function, create another vector called "ind.means"
# that is the mean for each individual (row) of Y, X1, and X2.
# (apply over MARGIN=1 = rows; all three columns are numeric, so the
# data.frame-to-matrix coercion inside apply() is harmless here.)
ind.means <- apply(Read.Mat[,c("Y","X1","X2")],1,mean)

# d) Create a subset of Read.Mat that only includes rows 1, 3, 5, 7, 8, 9,
# & 10, and only columns 1, 3, 4. Assign to a new object Read.Mat.subset1
Read.Mat.subset1 <- Read.Mat[c(1,3,5,7,8,9,10),c(1,3,4)]

# e) Create a subset of Read.Mat that only includes females.
# (6e continued) Assign this to a new object called Read.Mat.female
Read.Mat.female <- Read.Mat[Read.Mat$Female==1,]

# f) Create a subset of Read.Mat that only includes columns Y, X1, and X2 and
# only rows where the mean of Y, X1, and X2 is greater than 105 (reusing the
# per-row means computed above in "ind.means"). Assign to "Read.Mat.105"
Read.Mat.105 <- Read.Mat[ind.means > 105,c(2,4,5)]

# 7 PLOTTING AND FOR LOOPS ---------------------

# Create a 3x3 plot using a for loop. Each panel is the scatterplot between
# X1 & Y from a random subset of rows:
#   top row:    3 plots, each from a random sample of 10 rows
#   middle row: 3 plots, each from a random sample of 30 rows
#   bottom row: 3 plots, each from a random sample of 150 rows
# Make sure to create best-fit lines for each plot. Finally, change R's
# graphing options back to how they were beforehand.
# par() returns the OLD settings, so saving them in "op" lets us restore them
# with par(op) after the loop.
op <- par(mfrow=c(3,3))
# sub.size holds the sample size for each of the 9 panels: 10,10,10,30,30,30,150,150,150
sub.size <- c(rep(10,3),rep(30,3),rep(150,3))
for (k in 1:9){
  # sample sub.size[k] random rows, keeping only columns 2 (Y) and 4 (X1)
  data <- Read.Mat[sample(1:nrow(Read.Mat),sub.size[k]),c(2,4)]
  plot(data$Y,data$X1)
  abline(lm(data$Y~data$X1))}
par(op)
# NOTE(review): the two single-plot repetitions below look like leftover
# step-by-step demonstration code (they redo one panel at a time at sizes 10
# and 30); kept as-is since removing them would change what gets plotted.
data <- Read.Mat[sample(1:nrow(Read.Mat),sub.size[1]),c(2,4)]
plot(data$Y,data$X1)
abline(lm(data$Y~data$X1))
data <- Read.Mat[sample(1:nrow(Read.Mat),sub.size[2]),c(2,4)]
plot(data$Y,data$X1)
abline(lm(data$Y~data$X1))

# 8 REGRESSION ---------------------

# a) Run a linear regression of X1 & X2 & Level on Y. Assign to "lin.mod"
lin.mod <- lm(Y~X1+X2+Level,data=Read.Mat)

# b) Get the summary of lin.mod and assign it to "lin.mod.summary".
# Wrapping the assignment in ( ) makes R print the result as well as store it.
(lin.mod.summary <- summary(lin.mod))

# c) What are the effects being tested by the two Level contrasts? Can you
# interpret them? (Level is an ORDERED factor, so the default contrasts are
# polynomial: linear and quadratic.)
contrasts(Read.Mat$Level)

# d) What is the mode of "lin.mod.summary"? (summary.lm objects are lists)
mode(lin.mod.summary)
info(lin.mod.summary)

# e) What are the names of the objects in "lin.mod.summary"?
# (8e continued) List the named components of the summary.lm list
names(lin.mod.summary)

# f) Add two new columns to Read.Mat: "fitted" holds the fitted values
# (y-hats) of lin.mod, and "resids" holds the residuals of lin.mod.
# See ?fitted and ?resid.
Read.Mat$fitted <- fitted(lin.mod)
Read.Mat$resids <- resid(lin.mod)

# g) Create a plot of residuals (y-axis) against fitted values (x-axis).
# Is there any concern of heteroscedasticity?
plot(Read.Mat$fitted,Read.Mat$resids)

# h) Create a matrix called "reg.info": place the intercept and slopes of
# lin.mod in the first column and the p-values in the second column.
# Hint: you can grab objects from lin.mod.summary using "$" - same as in
# data.frames (indeed, data.frames ARE lists). Alternatively, see ?coef.
# FIX: the original code only *printed* these values and never created the
# "reg.info" object the exercise asks for. Column 1 of the coefficients
# table is the Estimate; column 4 is Pr(>|t|).
reg.info <- lin.mod.summary$coefficients[,c(1,4)]
reg.info
# Equivalent extraction using the coef() accessor on the summary object:
coef(lin.mod.summary)[,c(1,4)]

# BONUS QUESTION ---------------------
# We'll go over it next class period. Our statistical question for the day:
# Do artifactual univariate outliers in X change the type I or type II error
# rates in regression? Artifactual outliers occur due to miscoding,
# instrument error, etc. — they have no relation to the variable in question.
# To answer this, we'll need to simulate some data. A "Monte Carlo" way to
# approach this is to 1st simulate "population" data and resample from it in
# a loop (much faster than creating the data anew each iteration). To check
# both types of error rates, we'll need to simulate null data as well as data
# given some effect size. For each iteration we'll save the coefficient, the
# Std. Error, and the p-value, looping 1000 times. At the end we figure out
# the type I error rates (% of times pval < .05 in the null data) and the
# type II error rates (% of times p > .05 in the true-effect data), and
# compare to data without outliers to gauge the true type I and type II
# error rates.
# a) IF YOUR LAST NAME BEGINS WITH A - H, YOU WILL CHECK THE TYPE I ERRORS.
# RUN THIS CODE TO CREATE THE POPULATION DATA
x <- rnorm(50000)
# ~10% of observations receive outlier noise drawn from N(0, sd=4); the rest
# receive 0.
# FIX: rnorm() must generate 50000 draws to match the length-50000 logical
# vector. The original used rnorm(5000, sd=4), which silently RECYCLED the
# same 5000 outlier values 10 times across the population.
out <- (runif(50000)<.10)*rnorm(50000,sd=4)
x2 <- x+out    # heavy-tailed x: roughly 10% of values have a chance of being an outlier
y <- rnorm(50000)    # the y variable; unrelated to the x variables
# Population where there is NO effect, but outliers exist
pop.null <- data.frame(y=y,x.norm=x,x.outlier=x2)
pop.null[1:15,]    # look at the data

# b) IF YOUR LAST NAME BEGINS WITH I - Z, YOU WILL CHECK THE TYPE II ERRORS.
# RUN THIS CODE TO CREATE THE POPULATION DATA
x <- rnorm(50000)
# Same construction (and same recycling FIX) as in part a above
out <- (runif(50000)<.10)*rnorm(50000,sd=4)
x2 <- x+out
# x accounts for 15% of the variance in y: var = .15 + .85 = 1
y <- sqrt(.15)*x + sqrt(.85)*rnorm(50000)
# Population where there IS an effect, and outliers exist
pop.alt <- data.frame(y=y,x.norm=x,x.outlier=x2)
pop.alt[1:15,]    # look at the data

# c) Now run this to set up the matrix that we'll store our stats of interest
# in each round: 1000 runs x 6 stats (beta, SE, p for the no-outlier and
# outlier predictors)
results.matrix <- matrix(0,nrow=1000,ncol=6)
dimnames(results.matrix) <- list(paste("run",1:1000),
                                 c("beta.no","se.no","p.no","beta.outlier","se.outlier","p.outlier"))

# d) Now it's up to you. We're going to do 1000 iterations. For each
# iteration, randomly sample 50 rows from your population (another question
# we could ask is whether sample size affects these results, but for now keep
# sample size constant at 50). Run two linear regressions on this subsample:
# one is Y~x.norm and the other Y~x.outlier. Store the statistics of interest
# in the appropriate row of results.matrix. At the end, calculate the type I
# errors for the two alternative realities (outlier vs. no outlier) if in
# group 1, and the type II errors if in group 2.
# If you want to get fancy, you might also create histograms of the t-values
# from the two realities. ALSO, if you want, go ahead and do both type I and
# type II error rates - once you've done one, the other is easy.
# To get type-I errors, you want the proportion of tests that are significant
# (p < .05) in the null population (group 1).
# HOWEVER, before you start, *think* about the simulation. What will its
# outcome be? Provide your answer before you begin here:

# MY CODE:
# Second results matrix (same layout) for the alternative-population runs
results.matrix.2 <- results.matrix

# for group 1 (type I errors, null population):
myrows <- nrow(pop.null)
sampsize <- 50
for (i in 1:1000){
  # draw 50 random rows; fit y ~ x.norm and y ~ x.outlier on the subsample.
  # Coefficients row 2 = slope; columns 1, 2, 4 = Estimate, Std. Error, p-value
  data <- pop.null[sample(1:myrows,sampsize),]
  results.matrix[i,1:3] <- summary(lm(data[,1]~data[,2]))$coefficients[2,c(1,2,4)]
  results.matrix[i,4:6] <- summary(lm(data[,1]~data[,3]))$coefficients[2,c(1,2,4)]
}
# Type I error rate = proportion of null-data tests with p < .05 (~.05 expected)
alpha.no <- (results.matrix[,3] < .05)*1
alpha.outlier <- (results.matrix[,6] < .05)*1
mean(alpha.no);mean(alpha.outlier)

# for group 2 (type II errors, population with a true effect):
myrows <- nrow(pop.alt)
for (i in 1:1000){
  data <- pop.alt[sample(1:myrows,sampsize),]
  results.matrix.2[i,1:3] <- summary(lm(data[,1]~data[,2]))$coefficients[2,c(1,2,4)]
  results.matrix.2[i,4:6] <- summary(lm(data[,1]~data[,3]))$coefficients[2,c(1,2,4)]
}
# Type II error rate = proportion of true-effect tests that FAIL to reach
# p < .05, hence mean(1 - significant)
beta.no <- (results.matrix.2[,3] < .05)*1
beta.outlier <- (results.matrix.2[,6] < .05)*1
mean(1-beta.no);mean(1-beta.outlier)

# Histograms of t-values (Estimate / Std. Error) for all four conditions,
# written to a PDF. "title" is just embedded PDF metadata.
# NOTE(review): breaks = seq(-10,10) assumes all |t| <= 10; hist() errors if a
# t-value falls outside the breaks — extremely unlikely here, but worth knowing.
pdf("Alpha.Beta.Outlier.pdf",title="Alpha.Beta.Outliers.png")
par(mfrow=c(2,2))
hist(results.matrix[,1]/results.matrix[,2],main='no outliers, no effect',xlab='t values',breaks=seq(-10,10,by=1))
hist(results.matrix.2[,1]/results.matrix.2[,2],main='no outliers, effect',xlab='t values',breaks=seq(-10,10,by=1))
hist(results.matrix[,4]/results.matrix[,5],main='outliers, no effect',xlab='t values',breaks=seq(-10,10,by=1))
hist(results.matrix.2[,4]/results.matrix.2[,5],main='outliers, effect',xlab='t values',breaks=seq(-10,10,by=1))
dev.off()

# Check write.csv weirdness
x <- matrix(1:50,nrow=10)
# FIX: spell out FALSE rather than the shorthand F (T/F are ordinary variables
# in R and can be reassigned, so relying on them is unsafe)
write.table(x,row.names=FALSE,col.names=FALSE,quote=FALSE,file="hello",sep=",")