Differences

This shows you the differences between two versions of the page.

--- gscan_db_ga_p [2017/01/05 18:31] – /* Phenotypes */ hannah_young
+++ gscan_db_ga_p [2017/02/14 21:14] – /* Phenotypes */ scott
@@ Line 899: / Line 899: @@
+### Date: Feb 13 2017
+### Author: Scott Vrieze
-options(stringsAs Factors=F)
+options(stringsAsFactors=F)
 ### Load in dataset ###
-ninds <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/CIDR_StrokeGenetics/PhenoGenotypeFiles/RootStudyConsentSet_phs000615.CIDR_International_StrokeGenetics.v1.p1.c3.GRU-NPU/PhenotypeFiles/phs000615.v1.pht003307.v1.p1.c3.IschemicStroke_Subject_Phenotypes.GRU-NPU.txt.gz"), header = T, sep="\t")
+ninds <- read.table("/work/KellerLab/GSCAN/dbGaP/CIDR_StrokeGenetics/PhenoGenotypeFiles/RootStudyConsentSet_phs000615.CIDR_International_StrokeGenetics.v1.p1.c3.GRU-NPU/PhenotypeFiles/phs000615.v1.pht003307.v1.p1.c3.IschemicStroke_Subject_Phenotypes.GRU-NPU.txt.gz",
+                    header = T, sep="\t")
+### The file reads into R incorrectly because of a weird trailing tab
+### character in the data file, so use the below code to shift column
+### names to the correct column.
+names(ninds)[1] <- "XXX"
+ninds$dbGaP_Subject_ID <- row.names(ninds)
+ninds$smokingStatus <- NULL
+names(ninds) <- c(names(ninds)[2:17], "smokingStatus", "dbGaP_Subject_ID")
-### The file reads into R incorrectly, so use the below code to shift column names to the correct column.
+### subset the only variables needed
-tmp <- colnames(ninds)
+pheno <- subset(ninds, select=c("subject_id", "smokingStatus", "age", "gender", "AFFECTION_STATUS"))
-tmp2 <- tmp[c(2:19)]
-tmp2 <- c(tmp2, "tmp")
-colnames(ninds) <- tmp2
-### subset the only variables needed
-phenotypes <- c("smokingStatus", "age", "gender")
-pheno <- ninds2[phenotypes]
 ###————————————————————###
@@ Line 922: / Line 927: @@
 ### NINDS variable is “smokingStatus”
 ### Variables are “NEVER”, “FORMER”, “CURRENT”, “NO”, “UNKNOWN”, and blank.
 ###
-### ’CURRENT' defined as cigarette smoking within last 30 days, 'FORMER' defined as ###  more than 100 cigarettes in one's lifetime but no smoking within the last 30   ###  days; ’NEVER' defined as less than 100 cigarettes smoked in one's lifetime.
+### ’CURRENT' defined as cigarette smoking within last 30 days,
+### 'FORMER' defined as ### more than 100 cigarettes in one's lifetime
+### but no smoking within the last 30 ### days; ’NEVER' defined as
+### less than 100 cigarettes smoked in one's lifetime.
 ### table(pheno$smokingStatus)
 ###
 ###        CURRENT  FORMER   NEVER      NO UNKNOWN
 ###     63     727    1137    2397       4     251
-pheno[smokingStatus == "CURRENT" | smokingStatus == "FORMER", 4] <- 2
+si <- pheno$smokingStatus
-pheno[pheno$smokingStatus == "NEVER" | pheno$smokingStatus == “NO”, 4] <- 1
+si[si == "CURRENT" | si == "FORMER"] <- 2
+si[si == "NEVER"   | si == "NO"] <- 1
+si[si != "1" & si != "2"] <- NA
+si <- as.numeric(si)
-### table(pheno$V4)
+### table(si)
 ###
 ###    1    2
 ### 2401 1864
-colnames(pheno)[4] <- "si"
@@ Line 946: / Line 955: @@
 ### NINDS variable is “smokingStatus”
 ### Variables are “NEVER”, “FORMER”, “CURRENT”, “NO”, “UNKNOWN”, and blank.
 ###
-### ’CURRENT' defined as cigarette smoking within last 30 days, 'FORMER' defined as ###  more than 100 cigarettes in one's lifetime but no smoking within the last 30   ###  days; ’NEVER' defined as less than 100 cigarettes smoked in one's lifetime.
+### ’CURRENT' defined as cigarette smoking within last 30 days,
+### 'FORMER' defined as more than 100 cigarettes in one's lifetime but
+### no smoking within the last 30 days; ’NEVER' defined as less than
+### 100 cigarettes smoked in one's lifetime.
 ### table(pheno$smokingStatus)
 ###
 ###        CURRENT  FORMER   NEVER      NO UNKNOWN
 ###     63     727    1137    2397       4     251
-pheno[pheno$smokingStatus == "CURRENT", 5] <- 2
+sc <- pheno$smokingStatus
-pheno[pheno$smokingStatus == "FORMER", 5] <- 1
+sc[sc == "CURRENT"] <- 2
+sc[sc == "FORMER"] <- 1
+sc[sc != "1" & sc != "2"] <- NA
+sc <- as.numeric(sc)
 ### table(pheno$V5)
 ###
 ###    1    2
 ### 1137  727
-colnames(pheno)[5] <- "sc"
 ###——————————###
@@ Line 972: / Line 986: @@
 ### table(pheno$gender)
 ###
 ###    F    M
 ### 2627 1952
+sex <- pheno$gender
+sex[sex == "M"] <- 1
+sex[sex == "F"] <- 2
+sex <- as.numeric(sex)
-pheno[gender=="F", 6] <- 2
+###----------------###
-pheno[gender=="M", 6] <- 1
+### Write to files ###
+###----------------###
-colnames(pheno)[6] <- "sex"
+### This study uses the "subject_id" ID field in the genotype files,
+### so use that here, instead of the dbGaP_Subject_ID
+phenotypes <- data.frame(fid = pheno$subject_id,
+                     iid = pheno$subject_id,
+                     patid = "x",
+                     matid = "x",
+                     sex = sex,
+                     si = si,
+                     sc = sc)
-#########################
+write.table(phenotypes,
-#########################
+            "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/Stroke/GSCAN_STROKE_phenotypes.ped",
-###### FORM TABLES ######
+            row.names=F,
-#########################
+            quote = F,
-#########################
+            sep="\t")
-cbind(pheno, ninds[,1:2]) -> new
-unknown <- NA
-### PHENOTYPE TABLE ###
+covariates  <- data.frame(fid = pheno$subject_id,
+                    iid = pheno$subject_id,
+                    patid = "x",
+                    matid = "x",
+                    sex = sex,
+                    age = pheno$age,
+                    age2 = pheno$age^2,
+                    AFFECTION_STATUS = pheno$AFFECTION_STATUS)
-phenotype <- new[,c(7,8,6,4,5)]
+write.table(covariates,
-colnames(phenotype)[1] <- "iid"
+            "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/Stroke/GSCAN_STROKE_covariates.ped",
+            row.names=F,
+            quote = F,
+            sep="\t") ### save table
-### generate fid, matid, patid columns
-phenotype$fid <- unknown
-phenotype$matid <- unknown
-phenotype$patid <- unkown
-phenotype <- phenotype[,c(6,1,7,8,2,3,4,5)] ### re-order
-phenotype <- replace(phenotype, is.na(phenotype), "x") ### generate x’s
-write.table(phenotype, "/work/KellerLab/GSCAN/dbGaP/CIDR_StrokeGenetics/STROKE_gscan_ANCESTRY_phen.ped", row.names=F, quote = F, sep="\t") ### save table
+====== BEAGESS ======
-### COVARIATE TABLE ###
-covariate  <- new[,c(7,8,6,2)]
-covariate$age2 <- covariate$age^2
-colnames(covariate)[1] <- "iid"
-### generate fid, matid, patid columns
+===== Phenotypes =====
-covariate$fid <- unknown
-covariate$matid <- unknown
-covariate$patid <- unknown
-covariate <- covariate[,c(6,1,7,8,2,3,4,5)] ### re-order
-covariate <- replace(covariate, is.na(covariate), "x") ### generate x’s
-write.table(covariate, "/work/KellerLab/GSCAN/dbGaP/CIDR_StrokeGenetics/STROKE_gscan_ANCESTRY_cov.ped", row.names=F, quote = F, sep="\t") ### save table
-== BEAGESS ==
+options(stringsAsFactors=F)
+beagess <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/PhenotypeFiles/phs000869.v1.pht004610.v1.p1.c1.BEAGESS_Subject_Phenotypes.GRU-MDS.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F)
-===Phenotypes===
+genos <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/GenotypeFiles/phg000580.v1.NCI_BEAGESS.genotype-calls-matrixfmt.c1.GRU-MDS.update/BEAGESS_dbGaP_29Jan2015.fam", header=FALSE, stringsAsFactors=F)
-beagess_data <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/PhenotypeFiles/phs000869.v1.pht004610.v1.p1.c1.BEAGESS_Subject_Phenotypes.GRU-MDS.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F)
-geno_data <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/GenotypeFiles/phg000580.v1.NCI_BEAGESS.genotype-calls-matrixfmt.c1.GRU-MDS.update/BEAGESS_dbGaP_29Jan2015.fam", header=FALSE, stringsAsFactors=F)
-geno_data <- geno_data[-c(6)]
-geno_data$V3[geno_data$V3 =="0"] = "x"
-geno_data$V4[geno_data$V4 =="0"] = "x"
-### IMPORTANT: All columns in original phenotype file are shifted one column to the the right, so the label of the column does not match the data in that column!!!
+### The file reads into R incorrectly because of a weird trailing tab
-si <- beagess_data$BMI
+### character in the data file, so use the below code to shift column
-sc <- beagess_data$BMI
+### names to the correct column.
-age <- beagess_data$sex
+names(beagess)[1] <- "XXX"
+beagess$dbGaP_Subject_ID <- row.names(beagess)
+beagess$assocEABEvsCO <- NULL
+names(beagess) <- c(names(beagess)[2:11], "assocEABEvsCO", "dbGaP_Subject_ID")
-beagess_data <- cbind(beagess_data, beagess_data[,6])
-colnames(beagess_data)[13] <- "sc"
+### SMOKING INITIATION
-### SMOKING INITIATION
 ###
 ### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI".
@@ Line 1048: / Line 1063: @@
 ###       -99 = not consented
 ###       -9 = Missing
 ###        0 = Never
 ###        1 = Former
 ###        2 = Current
 ###
 ### Descriptives:
 ###
 ### > table(beagess_data$BMI)
 ###
 ### -99   -9    0    1    2
 ### 494 2051 1515 2288  575
 ###
 ### > summary(beagess_data$BMI)
 ### Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
 ### -99.000  -9.000   0.000  -9.234   1.000   2.000
-beagess_data$BMI[beagess_data$BMI == -99] = "x"
+si <- beagess$cig_smk_status
-beagess_data$BMI[beagess_data$BMI == -9] = "x"
+si[si == 1 | si == 2] <- 2
-beagess_data$BMI[beagess_data$BMI == 1] = 2
+si[si == 0] <- 1
-beagess_data$BMI[beagess_data$BMI == 0] = 1
+si[si != 1 & si != 2] <- NA
 ### SMOKING Cessation
 ###
 ### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI".
@@ Line 1075: / Line 1090: @@
 ###       -99 = not consented
 ###       -9 = Missing
 ###        0 = Never
 ###        1 = Former
 ###        2 = Current
-###
-### Descriptives:
 ###
-### > table(beagess_data$BMI)
+### Descriptives:
-###
+### table(beagess$cig_smk_status)
-### -99   -9    0    1    2
+###  -99   -9    0    1    2
-### 494 2051 1515 2288  575
+###  494 2051 1515 2288  575
 ###
-### > summary(beagess_data$BMI)
+### Current == 2 & Former == 1 in GSCAN. This is already the case for these data.
-### Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
-### -99.000  -9.000   0.000  -9.234   1.000   2.000
-beagess_data$sc[beagess_data$sc == -99] = "x"
+sc <- beagess$cig_smk_status
-beagess_data$sc[beagess_data$sc == -9] = "x"
+sc[sc == 0 | sc == -9 | sc == -99] <- NA
-beagess_data$sc[beagess_data$sc == 0] = "x"
-### AGE
+### BEAGESS variable name is "agegpcat"
-###
-### BEAGESS variable name is "agegpcat" and is listed under the column "sex".
 ###   "Age in 5 year categories"
 ###   Response options are integers 1 through 14 or -9.
 ###       -9 = Missing
 ###       1 = 15-29 years of age
@@ Line 1115: / Line 1123: @@
 ###       14 = 90-100 years of age
 ###
 ### Descriptives:
 ###
-### > table(beagess_data$sex)
+### > table(beagess_data$agegpcat)
-###
-### -9    1    2    3    4    5    6    7    8    9   10   11   12   13   14
-### 27   19   48  112  190  378  657  958 1134 1238 1018  794  246   87   17
 ###
-### > summary(beagess_data$BMI)
+### -9    1    2    3    4    5    6    7    8    9   10   11   12   13   14
-### Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
+### 27   19   48  112  190  378  657  958 1134 1238 1018  794  246   87   17
-### -9.000   7.000   8.000   8.227  10.000  14.000
+###
+### This is fine, but change the -9 to NA
+beagess$agegpcat[beagess$agegpcat == -9] <- NA
-beagess_data$sex[beagess_data$sex== 1] = 22
-beagess_data$sex[beagess_data$sex== 2] = 32
-beagess_data$sex[beagess_data$sex== 3] = 37
-beagess_data$sex[beagess_data$sex== 4] = 42
-beagess_data$sex[beagess_data$sex== 5] = 47
-beagess_data$sex[beagess_data$sex== 6] = 52
-beagess_data$sex[beagess_data$sex== 7] = 57
-beagess_data$sex[beagess_data$sex== 8] = 62
-beagess_data$sex[beagess_data$sex== 9] = 67
-beagess_data$sex[beagess_data$sex== 10] = 72
-beagess_data$sex[beagess_data$sex== 11] = 77
-beagess_data$sex[beagess_data$sex== 12] = 82
-beagess_data$sex[beagess_data$sex== 13] = 87
-beagess_data$sex[beagess_data$sex== 14] = 95
-beagess_data$sex[beagess_data$sex== -9] = "x"
-beagess_data$sex <- as.numeric(beagess_data$sex)
-### rename SUBJID column to dbGaP_Subject_ID in genotype file so it matches phenotype file where SUBJID is misslabelled as dbGaP_Subject_ID
-colnames(geno_data) <- c("famid", "dbGaP_Subject_ID", "patid", "matid", "sex")
-### merge geno and pheno files
+### looks like BEAGESS uses their own internal "SUBJECT_ID" in the
-phen <- merge(geno_data,beagess_data, by="dbGaP_Subject_ID", all =TRUE)
+### genotype files, so we'll use that in our pedigree files
-phen <- phen[,c(1:5,7,10,17)]
+phenotypes <- data.frame(fid = beagess$SUBJECT_ID,
+                         iid = beagess$SUBJECT_ID,
+                         patid = "x",
+                         matid = "x",
+                         sex = beagess$sex,
+                         si = si,
+                         sc = sc)
-phen <- phen[,c(2,1,3,4,5,6,7,8)]
+phenotypes[is.na(phenotypes)] <- "x"
-colnames(phen) <- c("famid", "dbGaP_Subject_ID", "patid", "matid", "sex", "age","si","sc")
+write.table(phenotypes,
+            "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/BEAGESS/GSCAN_BEAGESS_phenotypes.ped",
+            row.names=F,
+            quote = F,
+            sep="\t")
-phenotypes <- data.frame(famid=phen$famid, dbGaP_Subject_ID=phen$dbGaP_Subject_ID, patid=phen$patid, matid=phen$matid, sex=phen$sex, si=phen$si, sc=phen$sc, currentformersmoker=phen$sc, age=phen$age, age2=phen$age^2)
-colnames(phenotypes) <- c("famid", "dbGaP_Subject_ID", "patid", "matid", "sex", "si", "sc", "currentformersmoker", "age", "age2")
-pcs <- read.table("/rc_scratch/hayo0753/BEAGESS/BEAGESS_pcs_and_ancestrys.txt",head=TRUE, stringsAsFactors=F, sep=" ")
+covariates  <- data.frame(fid = beagess$SUBJECT_ID,
-colnames(pcs) [1] <- "dbGaP_Subject_ID"
+                          iid = beagess$SUBJECT_ID,
+                          patid = "x",
+                          matid = "x",
+                          sex = beagess$sex,
+                          age = beagess$age,
+                          age2 = beagess$age^2,
+                          Barretts.esophagus.case_v_control = beagess$assocBEvsCO,
+                          Esophageal.adenocarcinoma.case_v_control = beagess$assocEAvsCO)
-gscan.phenotypes <- merge(phenotypes, pcs, by="dbGaP_Subject_ID", all=TRUE)
+covariates[is.na(covariates)] <- "x"
-gscan.phenotypes[is.na(gscan.phenotypes)] <- "x"
-### EUROPEANS - entire sample is EUR
+write.table(covariates,
-phenotypes.EUR.ped <- subset(gscan.phenotypes, ancestry == "EUR", select=c("famid","dbGaP_Subject_ID","patid","matid", "sex", "si", "sc"))
+                        "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/BEAGESS/GSCAN_BEAGESS_covariates.ped",
-write.table(phenotypes.EUR.ped, file="BEAGESS.EUR.phenotypes.ped", quote=F, col.names=T, row.names=F, sep="\t")
+                        row.names=F,
+                        quote = F,
-covariates.EUR.ped <- subset(gscan.phenotypes, ancestry == "EUR", select=c("famid","dbGaP_Subject_ID","patid","matid", "sex", "age", "age2",  "currentformersmoker", "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10"))
+                        sep="\t") ### save table
-write.table(covariates.EUR.ped, file="BEAGESS.EUR.covariates.ped", quote=F, col.names=T, row.names=F, sep="\t")
@@ Line 1175: / Line 1177: @@
 ====== Jackson Heart Study ======
-===== File Paths =====
-**HMB consent phenotypes**
-/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/PhenotypeFiles/phs000499.v3.pht003209.v1.p1.c3.J
-HS_CARe_Subject_Phenotypes.HMB-IRB.txt.gz
-**NPU consent phenotypes**
-/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c1.HMB-IRB-NPU/PhenotypeFiles/phs000499.v3.pht003209.v1.p1.
-c1.JHS_CARe_Subject_Phenotypes.HMB-IRB-NPU.txt.gz
-**HMB consent genotypes**
-/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/GenotypeFiles/phg000103.v1.JHS.genotype-cal
-ls-matrixfmt.c3.HMB-IRB/JHS_PLINK_illu_GRU.[bed/bim/fam]
-**NPU consent genotypes**
-/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c1.HMB-IRB-NPU/GenotypeFiles/phg000103.v1.JHS.genotype
--calls-matrixfmt.c1.HMB-IRB-NPU/JHS_PLINK_illu_NCU.[bed/bim/fam]
-**Subject Sample Mapping File**
-/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/GenotypeFiles/phg000104.v1.JHS.sample-info.M
-ULTI/JHS_subject_sample_map_8_10_2010.txt
@@ Line 1233: / Line 1206: @@
 names(covariates) [2] <- "iid"
 names(covariates) [6] <- "age"
+###--------------------###
+### Smoking initiation ###
+###--------------------###
+###
+### JHSCare variables are "current_smoker_baseline" and "former_smoking_baseline"
+###       current_smoker_baseline = Current smoking status at first participant visit
+###       former_smoker_baseline = Former smoking status at first participant visit
+###
+### Response options are
+###     0 - No
+###     1 - Yes
+###
+### table(si)
+###
+###    0    1    x
+### 1206 537  9
 current.smoker <- subset(mapped_geno_pcs, select=c("current_smoker_baseline", "SUBJID"))
@@ Line 1253: / Line 1243: @@
 }
 mapped_geno_pcs_phen <- cbind(si, mapped_geno_pcs)
+###--------------------###
+### Smoking Cessation ###
+###--------------------###
+###
+### JHSCare variables are "current_smoker_baseline" and "former_smoking_baseline"
+###       current_smoker_baseline = Current smoking status at first participant visit
+###       former_smoker_baseline = Former smoking status at first participant visit
+###
+### Response options are
+###     0 - No
+###     1 - Yes
+###
+### table(sc)
+###
+###    0    1    x
+### 1206 537  9
 current.smoker <- subset(mapped_geno_pcs, select=c("current_smoker_baseline", "SUBJID"))