keller_and_evans_lab:gscan_db_ga_p
Differences
This shows you the differences between two versions of the page.
| Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
| gscan_db_ga_p [2017/01/06 01:30] – /* Phenotypes */ hannah_young | keller_and_evans_lab:gscan_db_ga_p [2019/10/31 18:28] (current) – ↷ Links adapted because of a move operation 66.249.87.23 | ||
|---|---|---|---|
| Line 576: | Line 576: | ||
| ====== ARIC ====== | ====== ARIC ====== | ||
| - | |||
| - | (Hannah/ | ||
| - | |||
| - | ===== ID Mapping ===== | ||
| Line 864: | Line 860: | ||
| ### sort ARIC.AFR.covariates.ped | uniq > ARIC.AFR.covariates.ped | ### sort ARIC.AFR.covariates.ped | uniq > ARIC.AFR.covariates.ped | ||
| - | |||
| - | |||
| - | |||
| - | ===== Genotypes ===== | ||
| Line 878: | Line 870: | ||
| ===== Phenotypes ===== | ===== Phenotypes ===== | ||
| - | Description of phenotypes can be found here: {{file_mesa_phenotypes_-_final.pdf}} | + | Description of phenotypes can be found here: {{:file_mesa_phenotypes_-_final.pdf}} |
| ====== eMERGE ====== | ====== eMERGE ====== | ||
| - | (Hannah/ | ||
| ===== Phenotypes ===== | ===== Phenotypes ===== | ||
| - | Description of phenotypes | + | |
| + | |||
| + | options(stringsAsFactors=F) | ||
| + | |||
| + | |||
| + | ### eMERGE is broken into different consent classes. We can conduct analyses on hmb, hmb-gso-nic, | ||
| + | |||
| + | |||
| + | emerge.hmb <- read.table("/ | ||
| + | emerge.hmb.genos <- read.table("/ | ||
| + | |||
| + | |||
| + | emerge.hmb.gso.nic <- read.table("/ | ||
| + | emerge.hmb.gso.nic.genos <- read.table("/ | ||
| + | |||
| + | |||
| + | emerge.hmb.gso <- read.table("/ | ||
| + | emerge.hmb.gso.genos <- read.table("/ | ||
| + | |||
| + | |||
| + | ### Merge all files above according to SUBJID, which is used in the | ||
| + | ### genotype files. | ||
| + | |||
| + | emerge <- merge(emerge.hmb, | ||
| + | emerge <- merge(d, emerge.hmb.gso.nic, | ||
| + | |||
| + | ### SMOKING INITIATION | ||
| + | ### | ||
| + | ### The eMERGE variable name is SMOKING_STATUS | ||
| + | ### C65108 = never smoker | ||
| + | ### C67147 = current smoker | ||
| + | ### C67148 = past smoker | ||
| + | ### C67151 = Unknown if ever smoked | ||
| + | ### | ||
| + | ### Descriptives: | ||
| + | ### | ||
| + | ### table(emerge$SMOKING_STATUS) | ||
| + | ### | ||
| + | ### C65108 C67147 C67148 C67151 | ||
| + | ### | ||
| + | |||
| + | si <- emerge$SMOKING_STATUS | ||
| + | si[si == " | ||
| + | si[si == " | ||
| + | si[si != 1 & si != 2] <- NA | ||
| + | |||
| + | ### SMOKING Cessation | ||
| + | ### | ||
| + | ### Current == 2 & Former == 1 in GSCAN. This is already the case for these data. | ||
| + | |||
| + | sc <- emerge$SMOKING_STATUS | ||
| + | sc[sc == " | ||
| + | sc[sc == " | ||
| + | sc[sc != 1 & sc != 2] <- NA | ||
| + | |||
| + | |||
| + | ### eMERGE age variable is tricky because there is no obvious age at | ||
| + | ### assessment. We will use their " | ||
| + | ### approximation. | ||
| + | ### 1=1900-1919; | ||
| + | ### | ||
| + | ### Descriptives: | ||
| + | ### | ||
| + | ### table(emerge$DECADE_BIRTH) | ||
| + | ### | ||
| + | ### | ||
| + | ### | ||
| + | birthyear <- emerge$DECADE_BIRTH | ||
| + | birthyear[birthyear == " | ||
| + | birthyear[birthyear == " | ||
| + | |||
| + | |||
| + | ### SEX | ||
| + | sex <- emerge$SEX | ||
| + | sex[sex == " | ||
| + | sex[sex == " | ||
| + | |||
| + | |||
| + | ### Scott decided not to correct for additional case-control variables | ||
| + | ### given what appears to be a highly complex sample and uncertainty | ||
| + | ### about the best course of action to account for disease status in | ||
| + | ### conducting smoking analyses. | ||
| + | |||
| + | phenotypes <- data.frame(fid = emerge$SUBJID, | ||
| + | iid = emerge$SUBJID, | ||
| + | patid = " | ||
| + | matid = " | ||
| + | sex = sex, | ||
| + | si = si, | ||
| + | sc = sc) | ||
| + | |||
| + | phenotypes[is.na(phenotypes)] <- " | ||
| + | |||
| + | write.table(phenotypes, | ||
| + | "/ | ||
| + | row.names=F, | ||
| + | quote = F, | ||
| + | sep=" | ||
| + | |||
| + | |||
| + | covariates | ||
| + | iid = emerge$SUBJID, | ||
| + | patid = " | ||
| + | matid = " | ||
| + | sex = sex, | ||
| + | birthyear = birthyear) | ||
| + | |||
| + | covariates[is.na(covariates)] <- " | ||
| + | |||
| + | write.table(covariates, | ||
| + | "/ | ||
| + | row.names=F, | ||
| + | quote = F, | ||
| + | sep=" | ||
| Line 899: | Line 1004: | ||
| + | ### Date: Feb 13 2017 | ||
| + | ### Author: Scott Vrieze | ||
| - | options(stringsAs Factors=F) | + | options(stringsAsFactors=F) |
| - | ### Load in dataset ### | + | ### Load in dataset ### |
| - | ninds <- read.table(gzfile("/ | + | ninds <- read.table("/ |
| + | | ||
| + | ### The file reads into R incorrectly because of a weird trailing tab | ||
| + | ### character in the data file, so use the below code to shift column | ||
| + | ### names to the correct column. | ||
| + | names(ninds)[1] <- " | ||
| + | ninds$dbGaP_Subject_ID <- row.names(ninds) | ||
| + | ninds$smokingStatus <- NULL | ||
| + | names(ninds) <- c(names(ninds)[2: | ||
| + | | ||
| - | ### The file reads into R incorrectly, | + | ### subset |
| - | tmp <- colnames(ninds) | + | pheno <- subset(ninds, select=c(" |
| - | tmp2 <- tmp[c(2:19)] | + | |
| - | tmp2 <- c(tmp2, "tmp") | + | |
| - | colnames(ninds) <- tmp2 | + | |
| - | ### subset the only variables needed | ||
| - | phenotypes <- c(" | ||
| - | pheno <- ninds2[phenotypes] | ||
| ### | ### | ||
| Line 922: | Line 1032: | ||
| ### NINDS variable is “smokingStatus” | ### NINDS variable is “smokingStatus” | ||
| - | ### Variables are “NEVER”, | + | ### Variables are “NEVER”, |
| - | ### | + | ### |
| - | ### ’CURRENT' | + | ### ’CURRENT' |
| + | ### ' | ||
| + | ### but no smoking within the last 30 days; ’NEVER' | ||
| + | ### less than 100 cigarettes smoked in one's lifetime. | ||
| ### table(pheno$smokingStatus) | ### table(pheno$smokingStatus) | ||
| ### | ### | ||
| - | ### CURRENT | + | ### CURRENT |
| - | ### | + | ### |
| - | pheno[smokingStatus == " | + | si <- pheno$smokingStatus |
| - | pheno[pheno$smokingStatus | + | si[si == " |
| + | si[si == " | ||
| + | si[si != " | ||
| + | si <- as.numeric(si) | ||
| - | ### table(pheno$V4) | + | ### table(si) |
| ### | ### | ||
| - | ### 1 2 | + | ### 1 2 |
| - | ### 2401 1864 | + | ### 2401 1864 |
| - | + | ||
| - | colnames(pheno)[4] <- " | + | |
| Line 946: | Line 1060: | ||
| ### NINDS variable is “smokingStatus” | ### NINDS variable is “smokingStatus” | ||
| - | ### Variables are “NEVER”, | + | ### Variables are “NEVER”, |
| - | ### | + | ### |
| - | ### ’CURRENT' | + | ### ’CURRENT' |
| + | ### ' | ||
| + | ### no smoking within the last 30 days; ’NEVER' | ||
| + | ### 100 cigarettes smoked in one's lifetime. | ||
| ### table(pheno$smokingStatus) | ### table(pheno$smokingStatus) | ||
| ### | ### | ||
| - | ### CURRENT | + | ### CURRENT |
| - | ### | + | ### |
| - | pheno[pheno$smokingStatus == " | + | sc <- pheno$smokingStatus |
| - | pheno[pheno$smokingStatus | + | sc[sc == " |
| + | sc[sc == " | ||
| + | sc[sc != " | ||
| + | sc <- as.numeric(sc) | ||
| ### table(pheno$V5) | ### table(pheno$V5) | ||
| ### | ### | ||
| - | ### 1 2 | + | ### 1 2 |
| - | ### 1137 727 | + | ### 1137 727 |
| - | colnames(pheno)[5] <- " | ||
| ### | ### | ||
| Line 972: | Line 1091: | ||
| ### table(pheno$gender) | ### table(pheno$gender) | ||
| ### | ### | ||
| - | ### F M | + | ### F M |
| - | ### 2627 1952 | + | ### 2627 1952 |
| + | sex <- pheno$gender | ||
| + | sex[sex == " | ||
| + | sex[sex == " | ||
| + | sex <- as.numeric(sex) | ||
| - | pheno[gender==" | + | ###----------------### |
| - | pheno[gender==" | + | ### Write to files ### |
| + | ### | ||
| - | colnames(pheno)[6] | + | ### This study uses the " |
| + | ### so use that here, instead of the dbGaP_Subject_ID | ||
| + | phenotypes | ||
| + | iid = pheno$subject_id, | ||
| + | patid = "x", | ||
| + | matid = " | ||
| + | sex = sex, | ||
| + | si = si, | ||
| + | sc = sc) | ||
| - | ######################### | + | write.table(phenotypes, |
| - | ######################### | + | "/ |
| - | ###### FORM TABLES ###### | + | |
| - | ######################### | + | quote = F, |
| - | ######################### | + | |
| - | cbind(pheno, | ||
| - | unknown <- NA | ||
| - | ### PHENOTYPE TABLE ### | + | covariates |
| + | iid = pheno$subject_id, | ||
| + | patid = " | ||
| + | matid = " | ||
| + | sex = sex, | ||
| + | age = pheno$age, | ||
| + | age2 = pheno$age^2, | ||
| + | AFFECTION_STATUS = pheno$AFFECTION_STATUS) | ||
| - | phenotype <- new[,c(7,8,6,4,5)] | + | write.table(covariates, |
| - | colnames(phenotype)[1] <- "iid" | + | "/ |
| + | row.names=F, | ||
| + | quote = F, | ||
| + | sep="\t") ### save table | ||
| - | ### generate fid, matid, patid columns | ||
| - | phenotype$fid <- unknown | ||
| - | phenotype$matid <- unknown | ||
| - | phenotype$patid <- unkown | ||
| - | phenotype <- phenotype[, | ||
| - | phenotype <- replace(phenotype, | ||
| - | write.table(phenotype, | ||
| + | ====== BEAGESS ====== | ||
| - | ### COVARIATE TABLE ### | ||
| - | covariate | ||
| - | covariate$age2 <- covariate$age^2 | ||
| - | colnames(covariate)[1] <- " | ||
| - | ### generate fid, matid, patid columns | + | ===== Phenotypes ===== |
| - | covariate$fid <- unknown | + | |
| - | covariate$matid <- unknown | + | |
| - | covariate$patid <- unknown | + | |
| - | covariate <- covariate[, | + | |
| - | covariate <- replace(covariate, | ||
| - | write.table(covariate, | ||
| - | == BEAGESS == | + | options(stringsAsFactors=F) |
| + | beagess <- read.table("/ | ||
| - | ===Phenotypes=== | + | genos <- read.table("/ |
| - | beagess_data <- read.table("/ | ||
| - | geno_data <- read.table("/ | ||
| - | geno_data <- geno_data[-c(6)] | ||
| - | geno_data$V3[geno_data$V3 ==" | ||
| - | geno_data$V4[geno_data$V4 ==" | ||
| - | ### IMPORTANT: All columns in original phenotype | + | ### The file reads into R incorrectly because of a weird trailing tab |
| - | si <- beagess_data$BMI | + | ### character in the data file, so use the below code to shift column |
| - | sc <- beagess_data$BMI | + | ### names to the correct |
| - | age <- beagess_data$sex | + | names(beagess)[1] |
| + | beagess$dbGaP_Subject_ID | ||
| + | beagess$assocEABEvsCO <- NULL | ||
| + | names(beagess) | ||
| - | beagess_data <- cbind(beagess_data, | ||
| - | colnames(beagess_data)[13] <- " | ||
| - | + | ### SMOKING INITIATION | |
| - | ### SMOKING INITIATION | + | |
| ### | ### | ||
| ### BEAGESS variable name is " | ### BEAGESS variable name is " | ||
| Line 1048: | Line 1168: | ||
| ### -99 = not consented | ### -99 = not consented | ||
| ### -9 = Missing | ### -9 = Missing | ||
| - | ### 0 = Never | + | ### 0 = Never |
| ### 1 = Former | ### 1 = Former | ||
| ### 2 = Current | ### 2 = Current | ||
| - | ### | + | ### |
| - | ### Descriptives: | + | ### Descriptives: |
| ### | ### | ||
| ### > table(beagess_data$BMI) | ### > table(beagess_data$BMI) | ||
| - | ### | + | ### |
| - | ### -99 | + | ### -99 |
| - | ### 494 2051 1515 2288 575 | + | ### 494 2051 1515 2288 575 |
| ### | ### | ||
| ### > summary(beagess_data$BMI) | ### > summary(beagess_data$BMI) | ||
| - | ### Min. 1st Qu. Median | + | ### Min. 1st Qu. Median |
| - | ### -99.000 | + | ### -99.000 |
| - | beagess_data$BMI[beagess_data$BMI | + | si <- beagess$cig_smk_status |
| - | beagess_data$BMI[beagess_data$BMI | + | si[si == 1 | si == 2] <- 2 |
| - | beagess_data$BMI[beagess_data$BMI == 1] = 2 | + | si[si == 0] <- 1 |
| - | beagess_data$BMI[beagess_data$BMI == 0] = 1 | + | si[si != 1 & si != 2] <- NA |
| - | ### SMOKING Cessation | + | ### SMOKING Cessation |
| ### | ### | ||
| ### BEAGESS variable name is " | ### BEAGESS variable name is " | ||
| Line 1075: | Line 1195: | ||
| ### -99 = not consented | ### -99 = not consented | ||
| ### -9 = Missing | ### -9 = Missing | ||
| - | ### 0 = Never | + | ### 0 = Never |
| ### 1 = Former | ### 1 = Former | ||
| ### 2 = Current | ### 2 = Current | ||
| - | ### | ||
| - | ### Descriptives: | ||
| ### | ### | ||
| - | ### > table(beagess_data$BMI) | + | ### Descriptives: |
| - | ### | + | ### table(beagess$cig_smk_status) |
| - | ### -99 | + | ### -99 |
| - | ### 494 2051 1515 2288 575 | + | ### 494 2051 1515 2288 575 |
| ### | ### | ||
| - | ### > summary(beagess_data$BMI) | + | ### Current == 2 & Former == 1 in GSCAN. This is already the case for these data. |
| - | ### Min. 1st Qu. Median | + | |
| - | ### -99.000 | + | |
| - | beagess_data$sc[beagess_data$sc == -99] = " | + | sc <- beagess$cig_smk_status |
| - | beagess_data$sc[beagess_data$sc == -9] = " | + | sc[sc == 0 | sc == -9 | sc == -99] <- NA |
| - | beagess_data$sc[beagess_data$sc == 0] = " | + | |
| - | ### AGE | + | ### BEAGESS variable name is " |
| - | ### | + | |
| - | ### BEAGESS variable name is " | + | |
| ### " | ### " | ||
| - | ### | + | ### |
| ### -9 = Missing | ### -9 = Missing | ||
| ### 1 = 15-29 years of age | ### 1 = 15-29 years of age | ||
| Line 1115: | Line 1228: | ||
| ### 14 = 90-100 years of age | ### 14 = 90-100 years of age | ||
| ### | ### | ||
| - | ### Descriptives: | + | ### Descriptives: |
| ### | ### | ||
| - | ### > table(beagess_data$sex) | + | ### > table(beagess_data$agegpcat) |
| - | ### | + | |
| - | ### -9 1 2 3 4 5 6 7 8 9 | + | |
| - | ### 27 | + | |
| ### | ### | ||
| - | ### > summary(beagess_data$BMI) | + | ### -9 1 2 3 4 5 6 7 8 9 |
| - | ### Min. 1st Qu. Median | + | ### 27 |
| - | ### -9.000 | + | ### |
| - | + | ### This is fine, but change the -9 to NA | |
| - | beagess_data$sex[beagess_data$sex== | + | beagess$agegpcat[beagess$agegpcat |
| - | beagess_data$sex[beagess_data$sex== | + | |
| - | beagess_data$sex[beagess_data$sex== | + | |
| - | beagess_data$sex[beagess_data$sex== | + | |
| - | beagess_data$sex[beagess_data$sex== | + | |
| - | beagess_data$sex[beagess_data$sex== | + | |
| - | beagess_data$sex[beagess_data$sex== | + | |
| - | beagess_data$sex[beagess_data$sex== | + | |
| - | beagess_data$sex[beagess_data$sex== | + | |
| - | beagess_data$sex[beagess_data$sex== | + | |
| - | beagess_data$sex[beagess_data$sex== | + | |
| - | beagess_data$sex[beagess_data$sex== | + | |
| - | beagess_data$sex[beagess_data$sex== 13] = 87 | + | |
| - | beagess_data$sex[beagess_data$sex== 14] = 95 | + | |
| - | beagess_data$sex[beagess_data$sex== -9] = " | + | |
| - | beagess_data$sex | + | |
| - | ### rename SUBJID column to dbGaP_Subject_ID in genotype file so it matches phenotype file where SUBJID is mislabelled as dbGaP_Subject_ID | ||
| - | colnames(geno_data) <- c(" | ||
| - | ### merge geno and pheno files | ||
| - | phen <- merge(geno_data, | ||
| - | phen <- phen[, | ||
| - | phen <- phen[,c(2, | + | ### looks like BEAGESS uses their own internal " |
| - | colnames(phen) | + | ### genotype files, so we'll use that in our pedigree files |
| + | phenotypes | ||
| + | iid = beagess$SUBJECT_ID, | ||
| + | patid = "x", | ||
| + | matid = "x", | ||
| + | sex = beagess$sex, | ||
| + | si = si, | ||
| + | sc = sc) | ||
| + | phenotypes[is.na(phenotypes)] <- " | ||
| - | phenotypes <- data.frame(famid=phen$famid, dbGaP_Subject_ID=phen$dbGaP_Subject_ID, patid=phen$patid, matid=phen$matid, sex=phen$sex, si=phen$si, sc=phen$sc, currentformersmoker=phen$sc, | + | write.table(phenotypes, |
| - | colnames(phenotypes) <- c(" | + | "/ |
| + | row.names=F, | ||
| + | quote = F, | ||
| + | sep="\t") | ||
| - | pcs <- read.table("/ | ||
| - | colnames(pcs) [1] <- " | ||
| - | gscan.phenotypes | + | covariates |
| - | gscan.phenotypes[is.na(gscan.phenotypes)] <- " | + | iid = beagess$SUBJECT_ID, |
| + | patid = "x", | ||
| + | matid = " | ||
| + | sex = beagess$sex, | ||
| + | age = beagess$age, | ||
| + | age2 = beagess$age^2, | ||
| + | | ||
| + | Esophageal.adenocarcinoma.case_v_control = beagess$assocEAvsCO) | ||
| - | ### EUROPEANS - entire sample | + | covariates[is.na(covariates)] |
| - | phenotypes.EUR.ped | + | |
| - | write.table(phenotypes.EUR.ped, | + | |
| - | covariates.EUR.ped <- subset(gscan.phenotypes, | + | write.table(covariates, |
| - | write.table(covariates.EUR.ped, file=" | + | |
| + | row.names=F, | ||
| + | quote = F, | ||
| + | | ||
| Line 1175: | Line 1282: | ||
| ====== Jackson Heart Study ====== | ====== Jackson Heart Study ====== | ||
| - | |||
| - | |||
| - | ===== File Paths ===== | ||
| - | |||
| - | |||
| - | **HMB consent phenotypes** | ||
| - | |||
| - | / | ||
| - | HS_CARe_Subject_Phenotypes.HMB-IRB.txt.gz | ||
| - | |||
| - | **NPU consent phenotypes** | ||
| - | |||
| - | / | ||
| - | c1.JHS_CARe_Subject_Phenotypes.HMB-IRB-NPU.txt.gz | ||
| - | |||
| - | **HMB consent genotypes** | ||
| - | |||
| - | / | ||
| - | ls-matrixfmt.c3.HMB-IRB/ | ||
| - | |||
| - | **NPU consent genotypes** | ||
| - | |||
| - | / | ||
| - | -calls-matrixfmt.c1.HMB-IRB-NPU/ | ||
| - | |||
| - | **Subject Sample Mapping File** | ||
| - | |||
| - | / | ||
| - | ULTI/ | ||
| Line 1233: | Line 1311: | ||
| names(covariates) [2] <- " | names(covariates) [2] <- " | ||
| names(covariates) [6] <- " | names(covariates) [6] <- " | ||
| + | |||
| + | ### | ||
| + | ### Smoking initiation ### | ||
| + | ### | ||
| + | ### | ||
| + | ### JHSCare variables are " | ||
| + | ### | ||
| + | ### | ||
| + | ### | ||
| + | ### Response options are | ||
| + | ### 0 - No | ||
| + | ### 1 - Yes | ||
| + | ### | ||
| + | ### table(si) | ||
| + | ### | ||
| + | ### 0 1 x | ||
| + | ### 1206 537 9 | ||
| current.smoker <- subset(mapped_geno_pcs, | current.smoker <- subset(mapped_geno_pcs, | ||
| Line 1253: | Line 1348: | ||
| } | } | ||
| mapped_geno_pcs_phen <- cbind(si, mapped_geno_pcs) | mapped_geno_pcs_phen <- cbind(si, mapped_geno_pcs) | ||
| + | |||
| + | |||
| + | ### | ||
| + | ### Smoking Cessation ### | ||
| + | ### | ||
| + | ### | ||
| + | ### JHSCare variables are " | ||
| + | ### | ||
| + | ### | ||
| + | ### | ||
| + | ### Response options are | ||
| + | ### 0 - No | ||
| + | ### 1 - Yes | ||
| + | ### | ||
| + | ### table(sc) | ||
| + | ### | ||
| + | ### 0 1 x | ||
| + | ### 1206 537 9 | ||
| + | |||
| current.smoker <- subset(mapped_geno_pcs, | current.smoker <- subset(mapped_geno_pcs, | ||
| Line 1277: | Line 1391: | ||
| names(phenotypes)[1] <- " | names(phenotypes)[1] <- " | ||
| - | write.table(covariates, | + | write.table(covariates, |
| - | write.table(phenotypes, | + | write.table(phenotypes, |
keller_and_evans_lab/gscan_db_ga_p.1483666249.txt.gz · Last modified: by hannah_young
