#GCTA Practical

######################################
#PART 1 - GET GRM FOR SIMULATED DATA USING GCTA

#First, set working directory to location where your files are on your computer
setwd("XXX")

list.files()  #this lists all files in your working directory. If you don't see the relevant files here (e.g., "simd.bed", then you need to change the setwd() command above until you do

#Run GCTA to get GRM
system("gcta --bfile simd --make-grm --thread-num 2 --out simd.gcta")

#Let's look at the pi-hats found from GCTA. Will take a minute or so to load
grm <- read.table(gzfile("simd.gcta.grm.gz"),header=FALSE)
names(grm) <- c('p1','p2','nsnp','pihat')
dim(grm)  #what are the dimensions of grm?
#should equal (n*(n+1))/2
n <- 2000
(n*(n+1))/2   #does it?

head(grm,20)  #look at first 20 rows

#remove within-person pi-hats (inbreeding coefficients) so we don't have a lot of ~1 values
grm2 <- grm[grm$p1 != grm$p2,]
dim(grm2)  #should be 2000 fewer rows.

head(grm2,20) #look at first 20 rows

#Look at the distribution of between-person pi-hats
hist(grm2$pihat,breaks=100)   #why are there big positive outliers?
sd(grm2$pihat)
mean(grm2$pihat)
######################################


######################################
#PART 2 - GET H2 ESTIMATES USING GCTA

#Run GCTA to get GRM - run this 3 times for each phenotype
system("gcta --grm simd.gcta --pheno simd.pheno --mpheno 1 --reml --thread-num 2 --out simd.results1")

system("gcta --grm simd.gcta --pheno simd.pheno --mpheno 2 --reml --thread-num 2 --out simd.results2")

system("gcta --grm simd.gcta --pheno simd.pheno --mpheno 3 --reml --thread-num 2 --out simd.results3")

#You can also look at the three *hsq files to get your answers using "less" in unix
#(or just note what's printed to screen)
######################################


######################################
#PART 3 - GET H2 ESTIMATES USING HE REGRESSION LIKE APPROACH
#here, we will pull the phenotype data in, standardize it, create a product term, and do regression

#Pull in phenotype data
phen <- read.table("simd.pheno",header=FALSE)

#look at it
head(phen)
summary(phen)

#scale it so that it has unit variance and mean=0
phen$z5 <- scale(phen$V5)
phen$z4 <- scale(phen$V4)
phen$z3 <- scale(phen$V3)

#get the IDs of each person
ids <- read.table("simd.gcta.grm.id")
ids$num <- 1:nrow(ids)

#this bit of code matches the phenotypes of each person with the grm file
#For time sake, we'll just look at phenotype 3
grm2$phen3.p1 <-phen$z5[match(grm2$p1,ids$num)]
grm2$phen3.p2 <-phen$z5[match(grm2$p2,ids$num)]

#get the product of the two scores
grm2$product <- grm2$phen3.p1*grm2$phen3.p2

head(grm2)

#Finally, look at regression - you need to fill in "Y" and "X" here
summary(lm(grm2$Y ~ grm2$X))

#Answer here is an ESTIMATE of h2, and is unlikely to ever be the exact same as the GCTA estimate (the EXPECTED answer is the same, but not the answer for any given instance). The SE of this estimate is very large (much larger than what is output in R). Do you know why the SE is wrong in the above equation?

######################################