rm(list = ls(all = TRUE)) options(stringsAsFactors=FALSE) library(qvalue) library(GenABEL) library(GWASTools) library(qqman) # #---------------Load data-------------------------- setwd("/home/jenny/EWAS/") load("Methylation_smoking_discordant_practical.RData") # we have loaded 4 data objects # ls() # #-----have a look at the phenotype data------------ head(cov_cursmok) head(cov_nonsmok) summary(cov_cursmok) # the data from the smoking twin are in the objects named _cursmok # the data from the non-smoking twin are in the objects named _nonsmok # the data are sorted by twin pair # Are there differences in white blood cell counts between the smoking and non-smoking twin? cellcounts <- c("Neut_Perc", "Lymph_Perc", "Mono_Perc","Eos_Perc","Baso_Perc") results <- matrix(nrow=length(cellcounts), ncol=4) for (i in 1:length(cellcounts)) { out <- t.test(cov_cursmok[,cellcounts[i]],cov_nonsmok[,cellcounts[i]],paired=T) results[i,1] <- out$estimate results[i,2] <- out$p.value results[i,3] <- out$conf.int[1] results[i,4] <- out$conf.int[2] } colnames(results) <- c("MeanDifference_current_minus_never","pvalue","95confint_L","95confint_H") rownames(results) <- cellcounts results # differences in white blood cell counts between cases and controls can be an important confounder in EWAS studies # The methylation data you will work with have been adjusted for white blood blood cell counts (and other covariates) # #-----have a look at the methylation data------------ Meth_cursmok[,1:2] # this is the methylation data of the smoking twins for the first 2 CpG sites (columns) Meth_nonsmok[,1:2] # this is the methylation data of the non-smoking twins for the first 2 CpG sites (columns) ncol(Meth_cursmok) # this is the number of methylation sites in the dataset nrow(Meth_cursmok) # this is the number of twin pairs # EWAS for smoking in discordant MZ twin pairs results <- matrix(nrow=ncol(Meth_cursmok), ncol=4) for (i in 1: ncol(Meth_cursmok)) { out <- t.test(Meth_cursmok[,i], Meth_nonsmok[,i], paired=T) results[i,1] <- out$estimate results[i,2] <- out$p.value results[i,3] <- out$conf.int[1] results[i,4] <- out$conf.int[2] } output <- data.frame(colnames(Meth_cursmok),results) colnames(output) <- c("cgid","MeanDifference_current_minus_never","pvalue","95confint_L","95confint_H") rownames(output) <- output$cgid output <- output[order(output$pvalue),] head(output) # How many methylation sites are associcated with my phenotype? # 1) Bonferroni correction bonfp <- 0.05/411169 # total number of autosomal methylation sites after QC bonfp length(which(output$pvalue < bonfp)) #number of methylation sites that are significant at FDR 5% output[which(output$pvalue < bonfp),] # 2) False discovery rate 0.05 (5%) output$qvalue <- qvalue(output$pvalue)$qvalue length(which(output$qvalue < 0.05)) # number of methylation sites that are significant at FDR 5% # Where in the genome are our genome-wide significant hits located? # Load illumina 450k annotation file load("manifest_small.RData") ls() head(manifest) dim(manifest) # note that the annotation file contains information on 485577 methylation sites. This is the total number of sites head(manifest) # MAPINFO genomic position (basepair position) of the interrogated site (in the case of CpG site: the location of the cytosine). # we are using the information from genome build 37 (HG19). # Extract these sites from the annotation data: manifest <- manifest[rownames(output),] ################################# Manhattan plot ########################################### BP <- manifest$MAPINFO SNP <- rownames(output) # note: we are NOT looking at SNPs, but the function Manhattan was originally designed for GWAS (SNP data) P <- as.numeric(output$pvalue) CHR <- as.numeric(as.character(manifest$CHR)) BP <- as.numeric(manifest$MAPINFO) res <- data.frame(SNP,CHR,BP,P) col=topo.colors(2) pvals <- data.frame(output$pvalue,output$qvalue) pvals <- pvals[order(pvals$output.qvalue),] FDRp <- max(pvals[which(pvals$output.qvalue < 0.05),"output.pvalue"] ) # this is the p-value threshold corresponding to FDR 5% pdf("Manhattan.pdf") manhattan(res,main="EWAS smoking",suggestiveline=FALSE, genomewideline = FALSE,col =col,cex = 0.1, cex.axis = 0.58,cex.main=0.8) abline(h = -log10(bonfp), col = "red") abline(h = -log10(FDRp), col = "grey") dev.off() bonfsig <- rownames(output)[which(output$pvalue < bonfp)] bonfsig # these are our bonferroni significant methylation sites pdf("Topsite_discordantpairs.pdf") picture1 <- data.frame(c(Meth_nonsmok[,bonfsig[1]],Meth_cursmok[,bonfsig[1]]),c(rep("1",40),rep("2", 40)),c(1:40,1:40)) colnames(picture1) <- c("Methylation", "Twin","ID") stripchart(Methylation ~ Twin, picture1, pch=19, vertical=TRUE, main='cg05575921, AHRR',ylab='Methylation level (residual)',glab='Twin',group.names=c("non-smoking twin","smoking twin")) for(ID in split(picture1, picture1$ID)) lines(Methylation ~ Twin, ID) dev.off() # have a look at the location of the methylation sites that are significantly differentially methylated between discordant twins output <- data.frame(manifest,output) output[which(output$pvalue < bonfp),] # In which gene is the top site (site with the lowest p-value) located and what is the function of this gene? # look it up in e.g. OMIM or NCBI