User Tools

Site Tools


keller_and_evans_lab:gscan_db_ga_p

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
Next revision
Previous revision
keller_and_evans_lab:gscan_db_ga_p [2017/01/03 13:27]
hannah_young /* File Paths */
keller_and_evans_lab:gscan_db_ga_p [2019/10/31 12:28] (current)
66.249.87.23 ↷ Links adapted because of a move operation
Line 576: Line 576:
  
 ====== ARIC ====== ====== ARIC ======
- 
-(Hannah/Joyce to update this section following Framingham as a guide) 
- 
-===== ID Mapping ===== 
  
  
Line 864: Line 860:
 ### sort ARIC.AFR.covariates.ped | uniq > ARIC.AFR.covariates.ped ### sort ARIC.AFR.covariates.ped | uniq > ARIC.AFR.covariates.ped
  
- 
- 
- 
-===== Genotypes ===== 
  
  
Line 878: Line 870:
 ===== Phenotypes ===== ===== Phenotypes =====
  
-Description of phenotypes can be found here: {{file_mesa_phenotypes_-_final.pdf}}+Description of phenotypes can be found here: {{:file_mesa_phenotypes_-_final.pdf}}
  
  
 ====== eMERGE ====== ====== eMERGE ======
  
-(Hannah/Joyce to update following Framingham as a guide) 
  
  
 ===== Phenotypes ===== ===== Phenotypes =====
  
-Description of phenotypes can be found here{{file_emerge.pdf}}+ 
 + 
 +options(stringsAsFactors=F) 
 + 
 + 
 +### eMERGE is broken into different consent classes. We can conduct analyses on hmb, hmb-gso-nic, and  
 + 
 + 
 +emerge.hmb <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c1.HMB/PhenotypeFiles/phs000360.v3.pht003255.v2.p1.c1.MergedSet_Subject_Phenotypes.HMB.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F) 
 +emerge.hmb.genos <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c1.HMB/GenotypeFiles/matrix/c1.HMB/eMerge_660_11212012_c1.fam", header=FALSE, sep="\t", stringsAsFactors=F) 
 + 
 + 
 +emerge.hmb.gso.nic <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c3.HM-B-GSO-NIC/PhenotypeFiles/phs000360.v3.pht003255.v2.p1.c3.MergedSet_Subject_Phenotypes.HM-B-GSO-NIC.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F) 
 +emerge.hmb.gso.nic.genos <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c3.HM-B-GSO-NIC/GenotypeFiles/matrix/c3.HM-B-GSO-NIC/eMerge_660_11212012_c3.fam", header=FALSE, sep="\t", stringsAsFactors=F) 
 + 
 + 
 +emerge.hmb.gso <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c4.HMB-GSO/PhenotypeFiles/phs000360.v3.pht003255.v2.p1.c4.MergedSet_Subject_Phenotypes.HMB-GSO.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F) 
 +emerge.hmb.gso.genos <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c4.HMB-GSO/GenotypeFiles/matrix/c4.HMB-GSO/eMerge_660_11212012_c4.fam", header=FALSE, sep="\t", stringsAsFactors=F) 
 + 
 + 
 +### Merge all files above according to SUBJID, which is used in the 
 +### genotype files. 
 + 
 +emerge <- merge(emerge.hmb, emerge.hmb.gso, all=T) 
 +emerge <- merge(d, emerge.hmb.gso.nic, all=T) 
 + 
 +### SMOKING INITIATION 
 +### 
 +### The eMERGE variable name is SMOKING_STATUS 
 +###      C65108 = never smoker 
 +###      C67147 = current smoker 
 +###      C67148 = past smoker 
 +###      C67151 = Unknown if ever smoked 
 +### 
 +### Descriptives: 
 +### 
 +### table(emerge$SMOKING_STATUS) 
 +### 
 +### C65108 C67147 C67148 C67151  
 +###   2217   1736   3457   9635  
 + 
 +si <- emerge$SMOKING_STATUS 
 +si[si == "C67147" | si == "C67148"] <- 2 
 +si[si == "C65108"] <- 1 
 +si[si != 1 & si != 2] <- NA 
 + 
 +### SMOKING Cessation 
 +### 
 +### Current == 2 & Former == 1 in GSCAN. This is already the case for these data. 
 + 
 +sc <- emerge$SMOKING_STATUS 
 +sc[sc == "C67147"] <- 2 
 +sc[sc == "C67148"] <- 1 
 +sc[sc != 1 & sc != 2] <- NA 
 + 
 + 
 +### eMERGE age variable is tricky because there is no obvious age at 
 +### assessment. We will use their "DECADE_BIRTH" as a terrible 
 +### approximation. 
 +### 1=1900-1919; 2=1920-1929, 3=1930-1939; 4=1940-1949; 5=1950-1959; 6=Unknown 
 +### 
 +### Descriptives: 
 +### 
 +### table(emerge$DECADE_BIRTH) 
 +### 
 +###      1    2    3    4    5    6    7    8    9   99  
 +###    612 2667 3533 4439 3127 1291  761  490   10  109  
 +birthyear <- emerge$DECADE_BIRTH 
 +birthyear[birthyear == "99"] <- NA 
 +birthyear[birthyear == "."] <- NA 
 + 
 + 
 +### SEX 
 +sex <- emerge$SEX 
 +sex[sex == "C46109"] <- 1 
 +sex[sex == "C46110"] <- 2 
 + 
 + 
 +### Scott decided not to correct for additional case-control variables 
 +### given what appears to be a highly complex sample and uncertainty 
 +### about the best course of action to account for disease status in 
 +### conducting smoking analyses. 
 + 
 +phenotypes <- data.frame(fid = emerge$SUBJID, 
 +                         iid = emerge$SUBJID, 
 +                         patid = "x", 
 +                         matid = "x", 
 +                         sex = sex, 
 +                         si = si, 
 +                         sc = sc) 
 + 
 +phenotypes[is.na(phenotypes)] <- "x" 
 + 
 +write.table(phenotypes, 
 +            "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/eMERGE/GSCAN_eMERGE_phenotypes.ped", 
 +            row.names=F, 
 +            quote = F, 
 +            sep="\t"
 + 
 + 
 +covariates  <- data.frame(fid = emerge$SUBJID, 
 +                          iid = emerge$SUBJID, 
 +                          patid = "x", 
 +                          matid = "x", 
 +                          sex = sex, 
 +                          birthyear = birthyear)                          
 + 
 +covariates[is.na(covariates)] <- "x" 
 + 
 +write.table(covariates, 
 +            "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/eMERGE/GSCAN_eMERGE_covariates.ped", 
 +            row.names=F, 
 +            quote = F, 
 +            sep="\t"
  
  
Line 899: Line 1004:
  
  
 +### Date: Feb 13 2017
 +### Author: Scott Vrieze
  
-options(stringsAs Factors=F)+options(stringsAsFactors=F)
  
-### Load in dataset ### +### Load in dataset ###
  
-ninds <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/CIDR_StrokeGenetics/PhenoGenotypeFiles/RootStudyConsentSet_phs000615.CIDR_International_StrokeGenetics.v1.p1.c3.GRU-NPU/PhenotypeFiles/phs000615.v1.pht003307.v1.p1.c3.IschemicStroke_Subject_Phenotypes.GRU-NPU.txt.gz"), header = T, sep="\t")+ninds <- read.table("/work/KellerLab/GSCAN/dbGaP/CIDR_StrokeGenetics/PhenoGenotypeFiles/RootStudyConsentSet_phs000615.CIDR_International_StrokeGenetics.v1.p1.c3.GRU-NPU/PhenotypeFiles/phs000615.v1.pht003307.v1.p1.c3.IschemicStroke_Subject_Phenotypes.GRU-NPU.txt.gz", 
 +                    header = T, sep="\t")
  
 +### The file reads into R incorrectly because of a weird trailing tab
 +### character in the data file, so use the below code to shift column
 +### names to the correct column.
 +names(ninds)[1] <- "XXX"
 +ninds$dbGaP_Subject_ID <- row.names(ninds)
 +ninds$smokingStatus <- NULL
 +names(ninds) <- c(names(ninds)[2:17], "smokingStatus", "dbGaP_Subject_ID")
 +    
  
-### The file reads into R incorrectly, so use the below code to shift column names to the correct column.  +### subset the only variables needed 
-tmp <- colnames(ninds+pheno <- subset(ninds, select=c("subject_id", "smokingStatus", "age", "gender", "AFFECTION_STATUS"))
-tmp2 <- tmp[c(2:19)] +
-tmp2 <- c(tmp2, "tmp") +
-colnames(ninds<- tmp2+
  
-### subset the only variables needed  
-phenotypes <- c("smokingStatus", "age", "gender") 
-pheno <- ninds2[phenotypes] 
  
 ###————————————————————### ###————————————————————###
Line 922: Line 1032:
  
 ### NINDS variable is “smokingStatus” ### NINDS variable is “smokingStatus”
-### Variables are “NEVER”, “FORMER”, “CURRENT”, “NO”, “UNKNOWN”, and blank.  +### Variables are “NEVER”, “FORMER”, “CURRENT”, “NO”, “UNKNOWN”, and blank. 
-###  +### 
-### ’CURRENT' defined as cigarette smoking within last 30 days, 'FORMER' defined as ###  more than 100 cigarettes in one's lifetime but no smoking within the last 30   ###  days; ’NEVER' defined as less than 100 cigarettes smoked in one's lifetime.+### ’CURRENT' defined as cigarette smoking within last 30 days, 
 +### 'FORMER' defined as ### more than 100 cigarettes in one's lifetime 
 +### but no smoking within the last 30 ### days; ’NEVER' defined as 
 +### less than 100 cigarettes smoked in one's lifetime.
 ### table(pheno$smokingStatus) ### table(pheno$smokingStatus)
 ### ###
-###        CURRENT  FORMER   NEVER      NO UNKNOWN  +###        CURRENT  FORMER   NEVER      NO UNKNOWN 
-###     63     727    1137    2397           251 +###     63     727    1137    2397           251
  
-pheno[smokingStatus == "CURRENT"smokingStatus == "FORMER", 4] <- 2 +si <- pheno$smokingStatus 
-pheno[pheno$smokingStatus == "NEVER"pheno$smokingStatus == NO”, 4] <- 1+si[si == "CURRENT"si == "FORMER"] <- 2 
 +si[si == "NEVER"   si == "NO"] <- 1 
 +si[si != "1" & si != "2"] <- NA 
 +si <- as.numeric(si)
  
-### table(pheno$V4)+### table(si)
 ### ###
-###    1    2  +###    1    2 
-### 2401 1864  +### 2401 1864
- +
-colnames(pheno)[4] <- "si"+
  
  
Line 946: Line 1060:
  
 ### NINDS variable is “smokingStatus” ### NINDS variable is “smokingStatus”
-### Variables are “NEVER”, “FORMER”, “CURRENT”, “NO”, “UNKNOWN”, and blank.  +### Variables are “NEVER”, “FORMER”, “CURRENT”, “NO”, “UNKNOWN”, and blank. 
-###  +### 
-### ’CURRENT' defined as cigarette smoking within last 30 days, 'FORMER' defined as ###  more than 100 cigarettes in one's lifetime but no smoking within the last 30   ###  days; ’NEVER' defined as less than 100 cigarettes smoked in one's lifetime.+### ’CURRENT' defined as cigarette smoking within last 30 days, 
 +### 'FORMER' defined as more than 100 cigarettes in one's lifetime but 
 +### no smoking within the last 30 days; ’NEVER' defined as less than 
 +### 100 cigarettes smoked in one's lifetime.
 ### table(pheno$smokingStatus) ### table(pheno$smokingStatus)
 ### ###
-###        CURRENT  FORMER   NEVER      NO UNKNOWN  +###        CURRENT  FORMER   NEVER      NO UNKNOWN 
-###     63     727    1137    2397           251 +###     63     727    1137    2397           251
  
-pheno[pheno$smokingStatus == "CURRENT", 5] <- 2 +sc <- pheno$smokingStatus 
-pheno[pheno$smokingStatus == "FORMER", 5] <- 1+sc[sc == "CURRENT"] <- 2 
 +sc[sc == "FORMER"] <- 1 
 +sc[sc != "1" & sc != "2"] <- NA 
 +sc <- as.numeric(sc)
  
 ### table(pheno$V5) ### table(pheno$V5)
 ### ###
-###    1    2  +###    1    2 
-### 1137  727 +### 1137  727
  
-colnames(pheno)[5] <- "sc" 
  
 ###——————————### ###——————————###
Line 972: Line 1091:
 ### table(pheno$gender) ### table(pheno$gender)
 ### ###
-###    F    M  +###    F    M 
-### 2627 1952 +### 2627 1952
  
 +sex <- pheno$gender
 +sex[sex == "M"] <- 1
 +sex[sex == "F"] <- 2
 +sex <- as.numeric(sex)
  
-pheno[gender=="F", 6] <2 +###----------------### 
-pheno[gender=="M", 6] <1+### Write to files ### 
 +###----------------###
  
-colnames(pheno)[6] <- "sex"+### This study uses the "subject_id" ID field in the genotype files, 
 +### so use that here, instead of the dbGaP_Subject_ID 
 +phenotypes <- data.frame(fid = pheno$subject_id, 
 +                     iid = pheno$subject_id, 
 +                     patid = "x"
 +                     matid = "x", 
 +                     sex = sex, 
 +                     si = si, 
 +                     sc = sc)
  
-######################### +write.table(phenotypes, 
-######################### +            "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/Stroke/GSCAN_STROKE_phenotypes.ped", 
-###### FORM TABLES ###### +            row.names=F, 
-######################### +            quote = F, 
-#########################+            sep="\t")
  
-cbind(pheno, ninds[,1:2]) -> new 
-unknown <- NA 
  
-### PHENOTYPE TABLE ###+covariates  <- data.frame(fid = pheno$subject_id, 
 +                    iid = pheno$subject_id, 
 +                    patid = "x", 
 +                    matid = "x", 
 +                    sex = sex, 
 +                    age = pheno$age, 
 +                    age2 = pheno$age^2, 
 +                    AFFECTION_STATUS = pheno$AFFECTION_STATUS)
  
-phenotype <- new[,c(7,8,6,4,5)] +write.table(covariates, 
-colnames(phenotype)[1] <- "iid+            "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/Stroke/GSCAN_STROKE_covariates.ped", 
 +            row.names=F, 
 +            quote = F
 +            sep="\t") ### save table
  
-### generate fid, matid, patid columns  
-phenotype$fid <- unknown 
-phenotype$matid <- unknown 
-phenotype$patid <- unkown  
-phenotype <- phenotype[,c(6,1,7,8,2,3,4,5)] ### re-order 
  
-phenotype <- replace(phenotype, is.na(phenotype), "x") ### generate x’s  
-write.table(phenotype, "/work/KellerLab/GSCAN/dbGaP/CIDR_StrokeGenetics/STROKE_gscan_ANCESTRY_phen.ped", row.names=F, quote = F, sep="\t") ### save table  
  
 +====== BEAGESS ======
  
-### COVARIATE TABLE ### 
  
-covariate  <- new[,c(7,8,6,2)] 
-covariate$age2 <- covariate$age^2  
-colnames(covariate)[1] <- "iid" 
  
-### generate fid, matid, patid columns  +===== Phenotypes =====
-covariate$fid <- unknown +
-covariate$matid <- unknown +
-covariate$patid <- unknown +
-covariate <- covariate[,c(6,1,7,8,2,3,4,5)] ### re-order+
  
-covariate <- replace(covariate, is.na(covariate), "x") ### generate x’s 
-write.table(covariate, "/work/KellerLab/GSCAN/dbGaP/CIDR_StrokeGenetics/STROKE_gscan_ANCESTRY_cov.ped", row.names=F, quote = F, sep="\t") ### save table 
  
  
-== BEAGESS ==+options(stringsAsFactors=F)
  
 +beagess <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/PhenotypeFiles/phs000869.v1.pht004610.v1.p1.c1.BEAGESS_Subject_Phenotypes.GRU-MDS.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F)
  
-===Phenotypes===+genos <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/GenotypeFiles/phg000580.v1.NCI_BEAGESS.genotype-calls-matrixfmt.c1.GRU-MDS.update/BEAGESS_dbGaP_29Jan2015.fam", header=FALSE, stringsAsFactors=F)
  
-beagess_data <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/PhenotypeFiles/phs000869.v1.pht004610.v1.p1.c1.BEAGESS_Subject_Phenotypes.GRU-MDS.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F) 
-geno_data <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/GenotypeFiles/phg000580.v1.NCI_BEAGESS.genotype-calls-matrixfmt.c1.GRU-MDS.update/BEAGESS_dbGaP_29Jan2015.fam", header=FALSE, stringsAsFactors=F) 
-geno_data <- geno_data[-c(6)] 
-geno_data$V3[geno_data$V3 =="0"] = "x" 
-geno_data$V4[geno_data$V4 =="0"] = "x" 
  
-### IMPORTANT: All columns in original phenotype file are shifted one column to the the right, so the label of the column does not match the data in that column!!! +### The file reads into R incorrectly because of a weird trailing tab 
-si <- beagess_data$BMI +### character in the data file, so use the below code to shift column 
-sc <- beagess_data$BMI +### names to the correct column. 
-age <- beagess_data$sex+names(beagess)[1] <- "XXX" 
 +beagess$dbGaP_Subject_ID <- row.names(beagess) 
 +beagess$assocEABEvsCO <- NULL 
 +names(beagess) <- c(names(beagess)[2:11], "assocEABEvsCO", "dbGaP_Subject_ID")
  
-beagess_data <- cbind(beagess_data, beagess_data[,6]) 
-colnames(beagess_data)[13] <- "sc" 
  
- +### SMOKING INITIATION
-### SMOKING INITIATION +
 ### ###
 ### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI". ### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI".
Line 1048: Line 1168:
 ###       -99 = not consented ###       -99 = not consented
 ###       -9 = Missing ###       -9 = Missing
-###        0 = Never +###        0 = Never
 ###        1 = Former ###        1 = Former
 ###        2 = Current ###        2 = Current
-###     +### 
-### Descriptives: +### Descriptives:
 ### ###
 ### > table(beagess_data$BMI) ### > table(beagess_data$BMI)
-###  +### 
-### -99   -9    0    1    2  +### -99   -9    0    1    2 
-### 494 2051 1515 2288  575 +### 494 2051 1515 2288  575
 ### ###
 ### > summary(beagess_data$BMI) ### > summary(beagess_data$BMI)
-### Min. 1st Qu.  Median    Mean 3rd Qu.    Max.  +### Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-### -99.000  -9.000   0.000  -9.234   1.000   2.000 +### -99.000  -9.000   0.000  -9.234   1.000   2.000
  
-beagess_data$BMI[beagess_data$BMI == -99] = "x" +si <- beagess$cig_smk_status 
-beagess_data$BMI[beagess_data$BMI == -9] = "x" +si[si == 1 | si == 2] <2 
-beagess_data$BMI[beagess_data$BMI == 1= 2 +si[si == 0] <1 
-beagess_data$BMI[beagess_data$BMI == 0= 1+si[si != 1 & si != 2] <- NA
  
-### SMOKING Cessation +### SMOKING Cessation
 ### ###
 ### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI". ### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI".
Line 1075: Line 1195:
 ###       -99 = not consented ###       -99 = not consented
 ###       -9 = Missing ###       -9 = Missing
-###        0 = Never +###        0 = Never
 ###        1 = Former ###        1 = Former
 ###        2 = Current ###        2 = Current
-###  
-### Descriptives:  
 ### ###
-### > table(beagess_data$BMI) +### Descriptives: 
-###  +### table(beagess$cig_smk_status) 
-### -99   -9    0    1    2  +###  -99   -9    0    1    2  
-### 494 2051 1515 2288  575 +###  494 2051 1515 2288  575 
 ### ###
-### > summary(beagess_data$BMI) +### Current == 2 & Former == in GSCANThis is already the case for these data.
-### Min. 1st Qu.  Median    Mean 3rd Qu.    Max.  +
-### -99.000  -9.000   0.000  -9.234   1.000   2.000 +
  
-beagess_data$sc[beagess_data$sc == -99] = "x" +sc <- beagess$cig_smk_status 
-beagess_data$sc[beagess_data$sc == -9] = "x" +sc[sc == 0 | sc == -9 sc == -99<- NA
-beagess_data$sc[beagess_data$sc == 0= "x"+
  
-### AGE  +### BEAGESS variable name is "agegpcat"
-### +
-### BEAGESS variable name is "agegpcat" and is listed under the column "sex".+
 ###   "Age in 5 year categories" ###   "Age in 5 year categories"
-###   Response options are integers 1 through 14 or -9. +###   Response options are integers 1 through 14 or -9.
 ###       -9 = Missing ###       -9 = Missing
 ###       1 = 15-29 years of age ###       1 = 15-29 years of age
Line 1115: Line 1228:
 ###       14 = 90-100 years of age ###       14 = 90-100 years of age
 ### ###
-### Descriptives: +### Descriptives:
 ### ###
-### > table(beagess_data$sex) +### > table(beagess_data$agegpcat)
-###  +
-### -9    1    2    3    4    5    6    7    8    9   10   11   12   13   14  +
-### 27   19   48  112  190  378  657  958 1134 1238 1018  794  246   87   17 +
 ### ###
-### > summary(beagess_data$BMI) +### -9    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
-### Min. 1st Qu.  Median    Mean 3rd Qu.    Max.  +### 27   19   48  112  190  378  657  958 1134 1238 1018  794  246   87   17 
-### -9.000   7.000   8.000   8.227  10.000  14.000 +### 
 +### This is fine, but change the -9 to NA 
 +beagess$agegpcat[beagess$agegpcat == -9] <- NA
  
-beagess_data$sex[beagess_data$sex== 1] = 22 
-beagess_data$sex[beagess_data$sex== 2] = 32 
-beagess_data$sex[beagess_data$sex== 3] = 37 
-beagess_data$sex[beagess_data$sex== 4] = 42 
-beagess_data$sex[beagess_data$sex== 5] = 47 
-beagess_data$sex[beagess_data$sex== 6] = 52 
-beagess_data$sex[beagess_data$sex== 7] = 57 
-beagess_data$sex[beagess_data$sex== 8] = 62 
-beagess_data$sex[beagess_data$sex== 9] = 67 
-beagess_data$sex[beagess_data$sex== 10] = 72 
-beagess_data$sex[beagess_data$sex== 11] = 77 
-beagess_data$sex[beagess_data$sex== 12] = 82 
-beagess_data$sex[beagess_data$sex== 13] = 87 
-beagess_data$sex[beagess_data$sex== 14] = 95 
-beagess_data$sex[beagess_data$sex== -9] = "x" 
-beagess_data$sex <- as.numeric(beagess_data$sex) 
  
-### rename SUBJID column to dbGaP_Subject_ID in genotype file so it matches phenotype file where SUBJID is misslabelled as dbGaP_Subject_ID 
-colnames(geno_data) <- c("famid", "dbGaP_Subject_ID", "patid", "matid", "sex") 
  
-### merge geno and pheno files +### looks like BEAGESS uses their own internal "SUBJECT_ID" in the 
-phen <- merge(geno_data,beagess_databy="dbGaP_Subject_ID", all =TRUE) +### genotype files, so we'll use that in our pedigree files 
-phen <- phen[,c(1:5,7,10,17)]+phenotypes <- data.frame(fid = beagess$SUBJECT_ID, 
 +                         iid = beagess$SUBJECT_ID, 
 +                         patid = "x", 
 +                         matid "x", 
 +                         sex = beagess$sex, 
 +                         si = si, 
 +                         sc = sc)
  
-phen <- phen[,c(2,1,3,4,5,6,7,8)] +phenotypes[is.na(phenotypes)] <- "x"
-colnames(phen) <- c("famid", "dbGaP_Subject_ID", "patid", "matid", "sex", "age","si","sc")+
  
 +write.table(phenotypes,
 +            "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/BEAGESS/GSCAN_BEAGESS_phenotypes.ped",
 +            row.names=F,
 +            quote = F,
 +            sep="\t")
  
-phenotypes <- data.frame(famid=phen$famid, dbGaP_Subject_ID=phen$dbGaP_Subject_ID, patid=phen$patid, matid=phen$matid, sex=phen$sex, si=phen$si, sc=phen$sc, currentformersmoker=phen$sc, age=phen$age, age2=phen$age^2) 
-colnames(phenotypes) <- c("famid", "dbGaP_Subject_ID", "patid", "matid", "sex", "si", "sc", "currentformersmoker", "age", "age2") 
  
-pcs <- read.table("/rc_scratch/hayo0753/BEAGESS/BEAGESS_pcs_and_ancestrys.txt",head=TRUEstringsAsFactors=Fsep=" ") +covariates  <- data.frame(fid beagess$SUBJECT_ID, 
-colnames(pcs) [1] <- "dbGaP_Subject_ID"+                          iid beagess$SUBJECT_ID, 
 +                          patid = "x", 
 +                          matid = "x"
 +                          sex = beagess$sex, 
 +                          age = beagess$age, 
 +                          age2 = beagess$age^2, 
 +                          Barretts.esophagus.case_v_control = beagess$assocBEvsCO, 
 +                          Esophageal.adenocarcinoma.case_v_control = beagess$assocEAvsCO)
  
-gscan.phenotypes <- merge(phenotypes, pcs, by="dbGaP_Subject_ID", all=TRUE) +covariates[is.na(covariates)] <- "x"
-gscan.phenotypes[is.na(gscan.phenotypes)] <- "x"+
  
-### EUROPEANS - entire sample is EUR +write.table(covariates, 
-phenotypes.EUR.ped <- subset(gscan.phenotypes, ancestry == "EUR", select=c("famid","dbGaP_Subject_ID","patid","matid", "sex", "si", "sc")) +                        "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/BEAGESS/GSCAN_BEAGESS_covariates.ped", 
-write.table(phenotypes.EUR.pedfile="BEAGESS.EUR.phenotypes.ped", quote=F, col.names=T, row.names=F, sep="\t") +                        row.names=F, 
- +                        quote = F, 
-covariates.EUR.ped <- subset(gscan.phenotypes, ancestry == "EUR", select=c("famid","dbGaP_Subject_ID","patid","matid", "sex", "age", "age2",  "currentformersmoker", "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10")) +                        sep="\t"### save table                 
-write.table(covariates.EUR.ped, file="BEAGESS.EUR.covariates.ped", quote=F, col.names=T, row.names=F, sep="\t")+
  
  
Line 1177: Line 1284:
  
  
-===== File Paths =====+===== Phenotypes =====
  
  
-**HMB consent phenotypes** 
  
-/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/PhenotypeFiles/phs000499.v3.pht003209.v1.p1.c3.J +hmb <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/PhenotypeFiles/phs000499.v3.pht003209.v1.p1.c3.JHS_CARe_Subject_Phenotypes.HMB-IRB.txt.gz", header=T, sep="\t", stringsAsFactors=F) 
-HS_CARe_Subject_Phenotypes.HMB-IRB.txt.gz", header=T, sep="\t", stringsAsFactors=F+npu <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c1.HMB-IRB-NPU/PhenotypeFiles/phs000499.v3.pht003209.v1.p1.c1.JHS_CARe_Subject_Phenotypes.HMB-IRB-NPU.txt.gz", header=T, sep="\t", stringsAsFactors=F
 +phens <- rbind(hmb, npu)
  
-**NPU consent phenotypes**+hmb_geno <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/GenotypeFiles/phg000103.v1.JHS.genotype-calls-matrixfmt.c3.HMB-IRB/JHS_PLINK_illu_GRU.fam", header=F, stringsAsFactors=F) 
 +npu_geno <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c1.HMB-IRB-NPU/GenotypeFiles/phg000103.v1.JHS.genotype-calls-matrixfmt.c1.HMB-IRB-NPU/JHS_PLINK_illu_NCU.fam", header=F, stringsAsFactor=F) 
 +geno <- rbind(hmb_geno, npu_geno)
  
-/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c1.HMB-IRB-NPU/PhenotypeFiles/phs000499.v3.pht003209.v1.p1. +jhs_map <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/GenotypeFiles/phg000104.v1.JHS.sample-info.MULTI/JHS_subject_sample_map_8_10_2010.txt", header=T, sep="\t", stringsAsFactors=F
-c1.JHS_CARe_Subject_Phenotypes.HMB-IRB-NPU.txt.gz", header=T, sep="\t", stringsAsFactors=F+jhs_pcs <- read.table("/rc_scratch/hayo0753/JHS_files/JHS_ancestry_and_pcs.txt", header=T, stringsAsFactors=F)
  
-**HMB consent genotypes**+names(jhs_pcs) [2] <- "SUBJID" 
 +names(geno) <- c("SUBJID", "iid", "patid", "matid", "sex", "phen"
 +filtered_geno <- geno[(geno$SUBJID %in% phens$SUBJID),
 +filtered_phens <- phens[(phens$SUBJID %in% geno$SUBJID),]
  
-/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/GenotypeFiles/phg000103.v1.JHS.genotype-cal +mapped_geno <merge(filtered_phens, filtered_geno, by="SUBJID", all.x=T) 
-ls-matrixfmt.c3.HMB-IRB/JHS_PLINK_illu_GRU.fam"header=FstringsAsFactors=F+mapped_geno_pcs <merge(mapped_genojhs_pcs, by="SUBJID"all.x=T)
  
-**NPU consent genotypes**+covariates <- mapped_geno_pcs[c(1,69,70:72,5,75:84)] 
 +names(covariates) [1] <- "fid" 
 +names(covariates) [2] <- "iid" 
 +names(covariates) [6] <- "age" 
 + 
 +###--------------------### 
 +### Smoking initiation ### 
 +###--------------------### 
 +###  
 +### JHSCare variables are "current_smoker_baseline" and "former_smoking_baseline" 
 +###       current_smoker_baseline = Current smoking status at first participant visit 
 +###       former_smoker_baseline = Former smoking status at first participant visit 
 +### 
 +### Response options are 
 +###     0 - No 
 +###     1 - Yes 
 +### 
 +### table(si) 
 +### 
 +###    0    1    x 
 +### 1206 537  9 
 + 
 +current.smoker <- subset(mapped_geno_pcs, select=c("current_smoker_baseline", "SUBJID")) 
 +former.smoker <- subset(mapped_geno_pcs, select=c("former_smoker_baseline", "SUBJID")) 
 +N <- nrow(mapped_geno_pcs) 
 +si <- rep(NA, N) 
 +for(i in 1:N){ 
 +  if(is.na(current.smoker[i,1]) | is.na(former.smoker[i,1])){ 
 +    si[i] <- "x" 
 +  } 
 +  else if (current.smoker[i,1]  == 0 & former.smoker[i,1] == 0){ 
 +    si[i] <- "1" 
 +  } 
 +  else if (current.smoker[i,1]  == 0 & former.smoker[i,1] == 1){ 
 +    si[i] <- 2 ### former smokers are coded as 2 
 +  } 
 + else if (current.smoker[i,1]  == 1 & former.smoker[i,1] == 0){ 
 +    si[i] <- 2 ### current smokers are coded as 2 
 +  }  
 +
 +mapped_geno_pcs_phen <- cbind(si, mapped_geno_pcs) 
 + 
 + 
 +###--------------------### 
 +### Smoking Cessation ### 
 +###--------------------### 
 +###  
 +### JHSCare variables are "current_smoker_baseline" and "former_smoking_baseline" 
 +###       current_smoker_baseline = Current smoking status at first participant visit 
 +###       former_smoker_baseline = Former smoking status at first participant visit 
 +### 
 +### Response options are 
 +###     0 - No 
 +###     1 - Yes 
 +### 
 +### table(sc) 
 +### 
 +###    0    1    x 
 +### 1206 537  9 
 + 
 + 
 +current.smoker <- subset(mapped_geno_pcs, select=c("current_smoker_baseline", "SUBJID")) 
 +former.smoker <- subset(mapped_geno_pcs, select=c("former_smoker_baseline","SUBJID")) 
 +N <- nrow(mapped_geno_pcs) 
 +sc <- rep(NA, N) 
 +for(i in 1:N){ 
 +  if(is.na(current.smoker[i,1]) | is.na(former.smoker[i,1])){ 
 +    sc[i] <- "x" 
 +  } 
 +  else if (current.smoker[i,1]  == 0 & former.smoker[i,1] == 0){ 
 +    sc[i] <- "x" 
 +  } 
 +  else if (current.smoker[i,1]  == 0 & former.smoker[i,1] == 1){ 
 +    sc[i] <- 1 ### former smokers are coded as 1 
 +  } 
 + else if (current.smoker[i,1]  == 1 & former.smoker[i,1] == 0){ 
 +    sc[i] <- 2 ### current smokers are coded as 2 
 +  } 
 +
 +phenotypes <- cbind(sc, mapped_geno_pcs_phen)
  
-/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c1.HMB-IRB-NPU/GenotypeFiles/phg000103.v1.JHS.genotype +phenotypes <phenotypes[c(3,71,72:74,1:2)] 
--calls-matrixfmt.c1.HMB-IRB-NPU/JHS_PLINK_illu_NCU.fam", header=F, stringsAsFactor=F+names(phenotypes)[1] <- "fid"
  
-**Subject Sample Mapping File**+write.table(covariates, "AFR.JHS.covariates.ped", row.names=F, quote=F) 
 +write.table(phenotypes, "AFR.JHS.phenotypes.ped", row.names=F, quote=F)
  
-/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/GenotypeFiles/phg000104.v1.JHS.sample-info.M 
-ULTI/JHS_subject_sample_map_8_10_2010.txt", header=T, sep="\t", stringsAsFactors=F 
  
  
keller_and_evans_lab/gscan_db_ga_p.1483475257.txt.gz · Last modified: 2017/01/03 13:27 by hannah_young