getwd() setwd() #read data D <- read.table("ozbmi2.txt",header=TRUE) #write data write.table(D, "newdata.txt",se="\t") #list the variables in D names(D) # dimensions of D dim(D) # print the first 10 rows of D head(D, n=10) #referring to variables in D #format is Object$variable head(D$age, n=10) #You can make new variables within an existing object D$newage<- D$age*100 #Or overwrite a variable D$age<- D$age*100 #Or recode a variable #D$catage <- ifelse(D$age > 30, c("older"), c("younger")) #Mean and variance mean(D$age, na.rm =TRUE) var(D$age , na.rm =TRUE) #A bit more info summary(D$age) summary(D$age[which(D$agecat==1)]) #What about a categorical variable table(D$agecat) table(D$agecat,D$zyg) #typing D$ is getting annoying so we can attach the data attach(D) table(agecat,zyg) #detach(D) #Correlations anyone? cor(wt1,bmi1, use="complete") cor(ht1,bmi1, use="complete") #Multiple Linear Regression fit <- lm(bmi1 ~ age + zyg, data=D) summary(fit) # Other useful functions coefficients(fit) # model coefficients confint(fit, level=0.95) # CIs for model parameters anova(fit) # anova table vcov(fit) # covariance matrix for model parameters #Histogram #basic hist(age) #basic hist(age, breaks=12, col='red') # Add labels hist(age, breaks=12, col='red', xlab='age in years',main='Histogram of age') #Kernal density plot d <- density(age, na.rm = "TRUE") # returns the density data plot(d) # plots the results # make a png file to hold the plot png("zygdensity.png") hist(age, breaks=12, col='red', xlab='age in years',main='Histogram of age') # close the png file to allow viewing dev.off() # 4 figures arranged in 2 rows and 2 columns attach(mtcars) par(mfrow=c(2,2)) plot(wt,mpg, main="Scatterplot of wt vs. mpg") plot(wt,disp, type="n", main="wt vs disp") lines(sort(wt),sort(disp)) hist(wt, main="Histogram of wt") boxplot(wt, main="Boxplot of wt") #Super basic simulation N = 6000 maf = .03 es = .6 allele_counts = rbinom(n=N,prob=maf,size=2) # size=2 because we're diploid phenotypes = rnorm(n=N,m=0,sd=1) + es*allele_counts #Note that maf and effect size together will determine the proportion of overall variance explained by the variant. #Variance explained in the population is approximately equal to (es)2*maf. # For the above example, it should be about (.6)2*.03 = .0108, i.e. about 1% of variance explained. lm(phenotypes~allele_counts) ###Simulate some correlated data set.seed(200) rs=.5 xy <- mvrnorm (1000, c(0,0), matrix(c(1,rs,rs,1),2,2)) testData <- xy selVars <- c("X","Y") dimnames(testData) <- list(NULL, selVars) summary(testData) cov(testData)