getwd()

setwd()


#read data
D <- read.table("ozbmi2.txt"ť,header=TRUE)

#write data
write.table(D, "newdata.txt"ť,se="\t"ť)

#list the variables in D
	names(D)
# dimensions of D
	dim(D)
# print the first 10 rows of D
	head(D, n=10)
#referring to variables in D
	#format is Object$variable
	head(D$age, n=10)

#You can make new variables within an existing object 
	D$newage<- D$age*100
#Or overwrite a variable
	D$age<- D$age*100
#Or recode a variable
	#D$catage <- ifelse(D$age > 30, 
	c("older"), c("younger")) 

#Mean and variance
	mean(D$age, na.rm =TRUE)
	var(D$age , na.rm =TRUE)

#A bit more info
	summary(D$age)
	summary(D$age[which(D$agecat==1)])
#What about a categorical variable
	table(D$agecat)
	table(D$agecat,D$zyg)

#typing D$ is getting annoying so we can attach the data
	attach(D)
	table(agecat,zyg)
	#detach(D)
#Correlations anyone?
	cor(wt1,bmi1, use="complete")
	cor(ht1,bmi1, use="complete")


#Multiple Linear Regression
	fit <- lm(bmi1 ~ age + zyg, data=D)
	summary(fit) 
# Other useful functions 
	coefficients(fit) # model coefficients
	confint(fit, level=0.95) # CIs for model parameters 
	anova(fit) # anova table 
	vcov(fit) # covariance matrix for model parameters 


#Histogram
#basic
	hist(age)
#basic
	hist(age, breaks=12, col='red')
# Add labels 
	hist(age, breaks=12, col='red', xlab='age in years',main='Histogram of age')


#Kernal density plot
d <- density(age, na.rm = "TRUE") # returns the density data
plot(d) # plots the results

# make a png file to hold the plot
png("zygdensity.png")

	hist(age, breaks=12, col='red', xlab='age in years',main='Histogram of age')

# close the png file to allow viewing
dev.off()


# 4 figures arranged in 2 rows and 2 columns
attach(mtcars)
par(mfrow=c(2,2))
plot(wt,mpg, main="Scatterplot of wt vs. mpg")
plot(wt,disp, type="n", main="wt vs disp")
	lines(sort(wt),sort(disp)) 
hist(wt, main="Histogram of wt")
boxplot(wt, main="Boxplot of wt")

#Super basic simulation

N = 6000
maf = .03
es = .6
allele_counts = rbinom(n=N,prob=maf,size=2) # size=2 because we're diploid
phenotypes = rnorm(n=N,m=0,sd=1) + es*allele_counts

#Note that maf and effect size together will determine the proportion of overall variance explained by the variant. 
#Variance explained in the population is approximately equal to (es)2*maf. 
# For the above example, it should be about (.6)2*.03 = .0108, i.e. about 1% of variance explained.  
lm(phenotypes~allele_counts) 

###Simulate some correlated data


set.seed(200)
rs=.5
xy <- mvrnorm (1000, c(0,0), matrix(c(1,rs,rs,1),2,2))
testData <- xy
selVars <- c("X","Y")
dimnames(testData) <- list(NULL, selVars)
summary(testData)
cov(testData)