Differences

This shows you the differences between two versions of the page.

--- gscan_db_ga_p [2017/02/14 21:14] – /* Phenotypes */ scott
+++ keller_and_evans_lab:gscan_db_ga_p [2019/10/31 12:28] (current) – ↷ Links adapted because of a move operation 66.249.87.23
@@ Line 576: / Line 576: @@
 ====== ARIC ======
-(Hannah/Joyce to update this section following Framingham as a guide)
-===== ID Mapping =====
@@ Line 864: / Line 860: @@
 ### sort ARIC.AFR.covariates.ped | uniq > ARIC.AFR.covariates.ped
-===== Genotypes =====
@@ Line 878: / Line 870: @@
 ===== Phenotypes =====
-Description of phenotypes can be found here: {{file_mesa_phenotypes_-_final.pdf}}
+Description of phenotypes can be found here: {{:file_mesa_phenotypes_-_final.pdf}}
 ====== eMERGE ======
-(Hannah/Joyce to update following Framingham as a guide)
 ===== Phenotypes =====
-Description of phenotypes can be found here: {{file_emerge.pdf}}
+### Read strings as character vectors rather than factors.
+options(stringsAsFactors = FALSE)
+### eMERGE is broken into different consent classes. For each of three
+### classes -- hmb (c1), hmb-gso-nic (c3), and hmb-gso (c4) -- we read
+### the tab-delimited subject phenotype table (with a header row) and
+### the genotype .fam file (no header row).
+emerge.hmb <- read.table(
+  file = "/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c1.HMB/PhenotypeFiles/phs000360.v3.pht003255.v2.p1.c1.MergedSet_Subject_Phenotypes.HMB.txt.gz",
+  header = TRUE, sep = "\t", stringsAsFactors = FALSE
+)
+emerge.hmb.genos <- read.table(
+  file = "/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c1.HMB/GenotypeFiles/matrix/c1.HMB/eMerge_660_11212012_c1.fam",
+  header = FALSE, sep = "\t", stringsAsFactors = FALSE
+)
+emerge.hmb.gso.nic <- read.table(
+  file = "/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c3.HM-B-GSO-NIC/PhenotypeFiles/phs000360.v3.pht003255.v2.p1.c3.MergedSet_Subject_Phenotypes.HM-B-GSO-NIC.txt.gz",
+  header = TRUE, sep = "\t", stringsAsFactors = FALSE
+)
+emerge.hmb.gso.nic.genos <- read.table(
+  file = "/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c3.HM-B-GSO-NIC/GenotypeFiles/matrix/c3.HM-B-GSO-NIC/eMerge_660_11212012_c3.fam",
+  header = FALSE, sep = "\t", stringsAsFactors = FALSE
+)
+emerge.hmb.gso <- read.table(
+  file = "/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c4.HMB-GSO/PhenotypeFiles/phs000360.v3.pht003255.v2.p1.c4.MergedSet_Subject_Phenotypes.HMB-GSO.txt.gz",
+  header = TRUE, sep = "\t", stringsAsFactors = FALSE
+)
+emerge.hmb.gso.genos <- read.table(
+  file = "/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c4.HMB-GSO/GenotypeFiles/matrix/c4.HMB-GSO/eMerge_660_11212012_c4.fam",
+  header = FALSE, sep = "\t", stringsAsFactors = FALSE
+)
+### Combine the three consent-class phenotype tables into one data
+### frame. No `by` is given, so merge() joins on every column name the
+### two inputs share (presumably including SUBJID, the ID used in the
+### genotype files -- confirm); all = TRUE keeps rows that appear in
+### only one of the inputs.
+emerge <- merge(emerge.hmb, emerge.hmb.gso, all = TRUE)
+### The second merge must extend the running `emerge` result. It
+### previously referenced `d`, which is not defined in this section, so
+### the merge above was either lost or the script stopped with an error.
+emerge <- merge(emerge, emerge.hmb.gso.nic, all = TRUE)
+### SMOKING INITIATION
+###
+### Source variable: SMOKING_STATUS, coded as
+###      C65108 = never smoker
+###      C67147 = current smoker
+###      C67148 = past smoker
+###      C67151 = Unknown if ever smoked
+###
+### Descriptives:
+###
+### table(emerge$SMOKING_STATUS)
+###
+### C65108 C67147 C67148 C67151
+###   2217   1736   3457   9635
+###
+### Recode: current or past smoker -> 2, never smoker -> 1, and
+### everything else (including "unknown") -> NA.
+si <- emerge$SMOKING_STATUS
+ever_smoker <- si %in% c("C67147", "C67148")
+never_smoker <- si %in% "C65108"
+si[ever_smoker] <- 2
+si[never_smoker] <- 1
+### Anything that has not been recoded to 1 or 2 is treated as missing.
+si[!(si %in% c(1, 2))] <- NA
+### SMOKING CESSATION
+###
+### GSCAN coding is Current == 2 and Former == 1. The SMOKING_STATUS
+### codes are recoded to that scheme here: C67147 -> 2 and C67148 -> 1.
+### Every other value is set to NA.
+sc <- emerge$SMOKING_STATUS
+current_smoker <- sc %in% "C67147"
+former_smoker <- sc %in% "C67148"
+sc[current_smoker] <- 2
+sc[former_smoker] <- 1
+### Anything that has not been recoded to 1 or 2 is treated as missing.
+sc[!(sc %in% c(1, 2))] <- NA
+### The eMERGE age variable is tricky because there is no obvious age
+### at assessment. We use "DECADE_BIRTH" as a (terrible) approximation.
+### Code key:
+### 1=1900-1919; 2=1920-1929, 3=1930-1939; 4=1940-1949; 5=1950-1959; 6=Unknown
+###
+### Descriptives:
+###
+### table(emerge$DECADE_BIRTH)
+###
+###   .    1    2    3    4    5    6    7    8    9   99
+###   6  612 2667 3533 4439 3127 1291  761  490   10  109
+###
+### NOTE(review): the key above stops at 6 ("Unknown"), yet the table
+### also shows 7, 8, 9, 99 and ".". Only "99" and "." are set to NA
+### below, so code 6 is kept as a value -- confirm against the data
+### dictionary.
+birthyear <- emerge$DECADE_BIRTH
+missing_decade <- birthyear %in% c("99", ".")
+birthyear[missing_decade] <- NA
+### SEX
+###
+### Recode the SEX codes: C46109 -> 1 and C46110 -> 2. Any other value
+### is left unchanged.
+sex <- emerge$SEX
+is_c46109 <- sex %in% "C46109"
+is_c46110 <- sex %in% "C46110"
+sex[is_c46109] <- 1
+sex[is_c46110] <- 2
+### Scott decided not to correct for additional case-control variables
+### given what appears to be a highly complex sample and uncertainty
+### about the best course of action to account for disease status in
+### conducting smoking analyses.
+###
+### Build the phenotype table: SUBJID is used for both fid and iid, the
+### parent IDs are the placeholder "x", followed by sex, si and sc.
+phenotype_file <- "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/eMERGE/GSCAN_eMERGE_phenotypes.ped"
+phenotypes <- data.frame(
+  fid = emerge$SUBJID,
+  iid = emerge$SUBJID,
+  patid = "x",
+  matid = "x",
+  sex = sex,
+  si = si,
+  sc = sc
+)
+### Missing values are written out as "x".
+phenotypes[is.na(phenotypes)] <- "x"
+### Tab-delimited, with a header row, no row names and no quoting.
+write.table(
+  phenotypes,
+  file = phenotype_file,
+  quote = FALSE,
+  sep = "\t",
+  row.names = FALSE
+)
+### Build the covariate table: SUBJID is used for both fid and iid, the
+### parent IDs are the placeholder "x", followed by sex and birthyear.
+covariate_file <- "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/eMERGE/GSCAN_eMERGE_covariates.ped"
+covariates <- data.frame(
+  fid = emerge$SUBJID,
+  iid = emerge$SUBJID,
+  patid = "x",
+  matid = "x",
+  sex = sex,
+  birthyear = birthyear
+)
+### Missing values are written out as "x".
+covariates[is.na(covariates)] <- "x"
+### Tab-delimited, with a header row, no row names and no quoting.
+write.table(
+  covariates,
+  file = covariate_file,
+  quote = FALSE,
+  sep = "\t",
+  row.names = FALSE
+)