################################################ ## Twin-Based Introduction to R ## ################################################ ## This link provides some good introductory explanations to ## compare base R graphics ## and ggplot2 graphics which you will be learning later in the course. ## http://seriousstats.wordpress.com/2012/09/05/highqualitygraphs/ ## https://www.datanovia.com/en/blog/top-r-color-palettes-to-know-for-great-data-visualization/#viridis-color-palettes ## Questions? ## email- Elizabeth.Prom-Wormley@vcuhealth.org # Address for Boulder Workshop setwd('/home/elizabeth/2021/Working with Twin Data in R') # Code to install packages. You only need to do this once. #install.packages('psych', dependencies=TRUE) #install.packages('gplots', dependencies=TRUE) #install.packages('car', dependencies=TRUE) #install.packages('descr', dependencies=TRUE) #install.packages('gmodels', dependencies=TRUE) #install.packages("RColorBrewer") #install.packages('metafor', dependencies=TRUE) # Code to run packages in a given session. # You need to do this every time you have a new R session. library(psych) library(car) library(gmodels) library(descr) library(gplots) library(RColorBrewer) # Setting working directory. Your address will be different. # Mac address setwd('/Users/ecpromwormle/Documents/MyDocuments/Boulder/Boulder2021') # PC address example # setwd('C:\\Users\\cliffordjs\\Desktop\\R Training Scripts') twins<- read.table(file='tw19long_v3.txt',header=T,sep="\t") #twins2<- read.table(file='tw19wide_v3.txt',header=T,sep="\t") ## Review- What are ways we are able to get preliminary information ## on our dataset/variables? dim(twins) head(twins) names(twins) summary(twins) str(twins) # Frequency Tables using descr pacakage CrossTable(twins$SEX) #CrossTable(twins$ZYG2) #CrossTable(twins$vB2bin, twins$SEX) # Turn on PDF device #pdf(file="ClassGraphs.pdf") # Plot one histogram of vA1con hist(twins$vA1con, col="gray") #Plot continuous measures of interest all at the same time par(mfcol=c(3,3)) for (i in c(6,8,11,14,18,19,20,21,22)){ hist(twins[,i],main= paste(c("Histogram", "of",names(twins[i])),sep=" "), probability=TRUE,col= "gray", xlab= paste("mean",c(names(twins[i])), "=", round(mean(twins[,i],na.rm=TRUE),2))) d1<-density(twins[,i]) lines(d1) } for (i in c(6,8,11,14,18,19,20,21,22)){ qqnorm(twins[,i], main= paste(c("QQ Plot", "of",names(twins[i])),sep=" "), xlab= paste(c(names(twins[i])))) } ### Using boxplots boxplot(twins$vA1con~twins$SEX, col="gray") boxplot(twins$vA1con~twins$ZYG2, col="gray") for (i in c(6,8,11,14,18,19,20,21,22)){ boxplot(twins[,i]~twins$ZYG5, main= paste(c("Boxplot", "of", names(twins[i]), "by Zygosity"),sep=" "), col="gray") } ## LET'S PLAY! ## How would you alter the previous script to visualize ## whether there are differences by zygosity groups? # Turn off PDF device #dev.off() ### Moving on to working with twin data ### ## Get twin data in the right format for future use ## ## We will do this step-by-step first. ## Then, we will automate with a function. ## Last, we will share function that automates completely ## 1- Separate original object into two objects: ## A Twin 1 object and a Twin 2 object datA <- twins[twins[,"TWID"]==min(twins[,"TWID"]),] #twin1 object datB <- twins[twins[,"TWID"]==max(twins[,"TWID"]),] #twin2 object ## 2- Merge Twin 1 and Twin 2 objects by a Family ID ("FAMID") ## that is common to both ## 3- During Merge, ensure that data are differentiated between ## Twin 1 and Twin 2 (e.g., "VariableName1_T1 and VariableName1_T2") DAT <- merge(datA, datB, by="FAMID", all.x=TRUE, all.y=TRUE, suffixes=c("_T1","_T2")) ## 4- Remove data that is "unnecessary". ## TWID is no longer necesssary since we have indicated which data come ## from twin 1 and which come from twin 2 DAT[,paste("TWID","_T1",sep="")] <- NULL DAT[,paste("TWID","_T2",sep="")] <- NULL ## Make a single zygosity variable (ZYG2, ZYG5, ZYG6) ## rather than one for each twin. ## All of these different approaches do the same thing. DAT$ZYG5 <- ifelse(is.na(DAT$ZYG5_T1), DAT$ZYG5_T2, DAT$ZYG5_T1) DAT[,"ZYG2"] <- ifelse(is.na(DAT[,"ZYG2_T1"]), DAT[,"ZYG2_T2"], DAT[,"ZYG2_T1"]) zygosity<- "ZYG6" DAT[,zygosity] <- ifelse(is.na(DAT[,paste(zygosity,"_T1",sep="")]), DAT[,paste(zygosity,"_T2",sep="")], DAT[,paste(zygosity,"_T1",sep="")]) ## Removing Twin1 and Twin2 Zygosities DAT$ZYG2_T1<- NULL DAT$ZYG2_T2<- NULL DAT[,"ZYG5_T1"] <- NULL DAT[,"ZYG5_T2"] <- NULL DAT[,paste(zygosity,"_T1",sep="")] <- NULL DAT[,paste(zygosity,"_T2",sep="")] <- NULL ## Automate by putting together a twin-focused function. ## Put twin data all on one line using twin function. #INPUT: data.frame where each data for each individual is listed in separate record #OUTPUT: data.frame where twin pairs are listed in a single record twindat <- function(dat, famid, twinid, zygosity1, zygosity2, zygosity3) { datA <- dat[dat[,twinid]==min(dat[,twinid]),] #twin1 datB <- dat[dat[,twinid]==max(dat[,twinid]),] #twin2 DAT <- merge(datA, datB, by=famid, all.x=TRUE, all.y=TRUE, suffixes=c("_T1","_T2")) DAT[,paste(twinid,"_T1",sep="")] <- NULL DAT[,paste(twinid,"_T2",sep="")] <- NULL DAT[,zygosity1] <- ifelse(is.na(DAT[,paste(zygosity1,"_T1",sep="")]), DAT[,paste(zygosity1,"_T2",sep="")], DAT[,paste(zygosity1,"_T1",sep="")]) DAT[,paste(zygosity1,"_T1",sep="")] <- NULL DAT[,paste(zygosity1,"_T2",sep="")] <- NULL DAT[,zygosity2] <- ifelse(is.na(DAT[,paste(zygosity2,"_T1",sep="")]), DAT[,paste(zygosity2,"_T2",sep="")], DAT[,paste(zygosity2,"_T1",sep="")]) DAT[,paste(zygosity2,"_T1",sep="")] <- NULL DAT[,paste(zygosity2,"_T2",sep="")] <- NULL DAT[,zygosity3] <- ifelse(is.na(DAT[,paste(zygosity3,"_T1",sep="")]), DAT[,paste(zygosity3,"_T2",sep="")], DAT[,paste(zygosity3,"_T1",sep="")]) DAT[,paste(zygosity3,"_T1",sep="")] <- NULL DAT[,paste(zygosity3,"_T2",sep="")] <- NULL return(DAT) } # Four arguments needed for twindat() function Twins2 <- twindat(dat=twins, famid="FAMID", twinid="TWID", zygosity1="ZYG2", zygosity2="ZYG5", zygosity3="ZYG6") # Another way is by using the reshape function Twins2a <- reshape(data = twins, direction="wide", idvar = c('FAMID','ZYG2','ZYG5','ZYG6'), timevar = "TWID", sep = "_") write.table(Twins2a, file = "Twins2a.csv", row.names=FALSE, col.names=TRUE, sep = ",") ## REVIEW ## We identified 3 ways to prepare twin data for further analysis. ## 1- A self-coded step-by-step approach ## 2- An automated approach using a function that you personalize ## 3- An already available function ## What you decide may depend on how dirty your data are. ## WORKING WITH TWIN DATA ## ## Let's start getting a feel for the data dim(Twins2) names(Twins2) head(Twins2) MZ <- subset(Twins2,Twins2$ZYG2==1) DZ <- subset(Twins2,Twins2$ZYG2==2) # Scatterplot of twin relationships # par(mfcol=c(1,1)) plot(MZ$vA1con_T1,MZ$vA1con_T2, col="black") plot(DZ$vA1con_T1,DZ$vA1con_T2, col="black") plot(MZ$AGE_T1,MZ$AGE_T2, col="red") plot(DZ$AGE_T1,DZ$AGE_T2, col="blue") # Adding a loess curve par(mfcol=c(1,1)) pairs(MZ[c(4,29)],panel=panel.smooth, main="Scatterplot of MZ pairs") pairs(DZ[c(4,29)],panel=panel.smooth, main="Scatterplot of DZ pairs") ############################## # Graphing Twin Correlations # ############################## # Calculating Twin Correlations by Zygosity # rMZ<-cor(MZ$vA1con_T1,MZ$vA1con_T2,use="pairwise.complete.obs") rDZ<-cor(DZ$vA1con_T1,DZ$vA1con_T2,use="pairwise.complete.obs") # Combining results into one object # Corr2<-cbind(rMZ, rDZ) write.table(Corr2,file="corrs.csv",sep=",",row.names=FALSE,col.names=FALSE) #setup overall background to make transfer to slides prettier par(bg="white") #making barplot of twin correlations and putting a box around figure barplot(Corr2,ylim=c(0,1),col=c("cyan"), cex.axis=1.5,cex.names=1.5,space=0.1) # Same barplot with a color palette other than Base R # via the RColorBrewer package. See below for more # code to get started with RColorBrewer #barplot(Corr2[c(1,2)],ylim=c(0,1), # col=c("#1D91C0","#41B6C4"), # cex.axis=1,cex.names=1,space=0.1) #legend("topright",legend=c("rMZ", "rDZ"), pch=15, # pt.cex=1.5,col=c("#1D91C0","#41B6C4")) #box()