rm(list = ls(all = TRUE))
options(stringsAsFactors=FALSE)
library(qvalue)   
library(GenABEL)
library(GWASTools) 
library(qqman)
#
#---------------Load data--------------------------
setwd("/home/jenny/EWAS/")
load("Methylation_smoking_discordant_practical.RData")

# we have loaded 4 data objects
# 
ls()

#
#-----have a look at the phenotype data------------

head(cov_cursmok)
head(cov_nonsmok)
summary(cov_cursmok)

# the data from the smoking twin are in the objects named _cursmok
# the data from the non-smoking twin are in the objects named _nonsmok
# the data are sorted by twin pair 

                                                                                        
# Are there differences in white blood cell counts between the smoking and non-smoking twin?
cellcounts <- c("Neut_Perc",  "Lymph_Perc", "Mono_Perc","Eos_Perc","Baso_Perc")
results <- matrix(nrow=length(cellcounts), ncol=4)

for (i in 1:length(cellcounts)) {
out <- t.test(cov_cursmok[,cellcounts[i]],cov_nonsmok[,cellcounts[i]],paired=T)
results[i,1] <- out$estimate
results[i,2] <- out$p.value
results[i,3] <- out$conf.int[1]
results[i,4] <- out$conf.int[2]
}
colnames(results) <- c("MeanDifference_current_minus_never","pvalue","95confint_L","95confint_H")
rownames(results) <- cellcounts
results

# differences in white blood cell counts between cases and controls can be an important confounder in EWAS studies
# The methylation data you will work with have been adjusted for white blood blood cell counts (and other covariates)


#
#-----have a look at the methylation data------------

Meth_cursmok[,1:2]  # this is the methylation data of the smoking twins for the first 2 CpG sites (columns)
Meth_nonsmok[,1:2]  # this is the methylation data of the non-smoking twins for the first 2 CpG sites (columns)
ncol(Meth_cursmok)  # this is the number of methylation sites in the dataset
nrow(Meth_cursmok)  # this is the number of twin pairs


# EWAS for smoking in discordant MZ twin pairs
results <- matrix(nrow=ncol(Meth_cursmok), ncol=4)
for (i in 1: ncol(Meth_cursmok))
{
out <- t.test(Meth_cursmok[,i], Meth_nonsmok[,i], paired=T)
results[i,1] <- out$estimate
results[i,2] <- out$p.value
results[i,3] <- out$conf.int[1]
results[i,4] <- out$conf.int[2]
}
output <- data.frame(colnames(Meth_cursmok),results)
colnames(output) <- c("cgid","MeanDifference_current_minus_never","pvalue","95confint_L","95confint_H")
rownames(output) <- output$cgid
output <- output[order(output$pvalue),]
head(output)


# How many methylation sites are associcated with my phenotype?
# 1) Bonferroni correction

bonfp <- 0.05/411169 # total number of autosomal methylation sites after QC
bonfp                                        
length(which(output$pvalue < bonfp))        #number of methylation sites that are significant at FDR 5%
output[which(output$pvalue < bonfp),]

# 2) False discovery rate 0.05 (5%)
output$qvalue <- qvalue(output$pvalue)$qvalue
length(which(output$qvalue < 0.05))           # number of methylation sites that are significant at FDR 5%


#  Where in the genome are our genome-wide significant hits located?
# Load illumina 450k annotation file
load("manifest_small.RData")
ls()
head(manifest)
dim(manifest) # note that the annotation file contains information on 485577 methylation sites. This is the total number of sites
head(manifest)
# MAPINFO genomic position (basepair position) of the interrogated site (in the case of CpG site: the location of the cytosine).
# we are using the information from genome build 37 (HG19).

#  Extract these sites from the annotation data:
manifest <- manifest[rownames(output),]

#################################  Manhattan plot ###########################################
BP <- manifest$MAPINFO 
SNP <- rownames(output)      # note: we are NOT looking at SNPs, but the function Manhattan was originally designed for GWAS (SNP data)
P    <- as.numeric(output$pvalue)
CHR <- as.numeric(as.character(manifest$CHR))
BP  <- as.numeric(manifest$MAPINFO)
res <- data.frame(SNP,CHR,BP,P)
col=topo.colors(2)
pvals <- data.frame(output$pvalue,output$qvalue)
pvals <- pvals[order(pvals$output.qvalue),]
FDRp <- max(pvals[which(pvals$output.qvalue < 0.05),"output.pvalue"] ) # this is the p-value threshold corresponding to FDR 5%

pdf("Manhattan.pdf")
manhattan(res,main="EWAS smoking",suggestiveline=FALSE, genomewideline = FALSE,col =col,cex = 0.1, cex.axis = 0.58,cex.main=0.8)
abline(h = -log10(bonfp), col = "red")
abline(h = -log10(FDRp), col = "grey")
dev.off()

bonfsig <- rownames(output)[which(output$pvalue < bonfp)]
bonfsig  # these are our bonferroni significant methylation sites

pdf("Topsite_discordantpairs.pdf")
picture1 <- data.frame(c(Meth_nonsmok[,bonfsig[1]],Meth_cursmok[,bonfsig[1]]),c(rep("1",40),rep("2", 40)),c(1:40,1:40))
colnames(picture1) <- c("Methylation", "Twin","ID")
stripchart(Methylation ~ Twin, picture1, pch=19, vertical=TRUE, main='cg05575921, AHRR',ylab='Methylation level (residual)',glab='Twin',group.names=c("non-smoking twin","smoking twin"))
for(ID in split(picture1, picture1$ID))
lines(Methylation ~ Twin, ID)
dev.off()

# have a look at the location of the methylation sites that are significantly differentially methylated between discordant twins
output <- data.frame(manifest,output)
output[which(output$pvalue < bonfp),] 

# In which gene is the top site (site with the lowest p-value) located and what is the function of this gene?
# look it up in e.g. OMIM or NCBI