User Tools

Site Tools


keller_and_evans_lab:gscan_db_ga_p

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revisionPrevious revision
Next revisionBoth sides next revision
gscan_db_ga_p [2017/02/13 21:35] – /* Phenotypes */ scottgscan_db_ga_p [2017/02/14 21:14] – /* Phenotypes */ scott
Line 1040: Line 1040:
  
  
-beagess_data <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/PhenotypeFiles/phs000869.v1.pht004610.v1.p1.c1.BEAGESS_Subject_Phenotypes.GRU-MDS.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F) +options(stringsAsFactors=F)
-geno_data <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/GenotypeFiles/phg000580.v1.NCI_BEAGESS.genotype-calls-matrixfmt.c1.GRU-MDS.update/BEAGESS_dbGaP_29Jan2015.fam", header=FALSE, stringsAsFactors=F) +
-geno_data <- geno_data[-c(6)] +
-geno_data$V3[geno_data$V3 =="0"] = "x" +
-geno_data$V4[geno_data$V4 =="0"] = "x"+
  
-### IMPORTANT: All columns in original phenotype file are shifted one column to the the right, so the label of the column does not match the data in that column!!! +beagess <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/PhenotypeFiles/phs000869.v1.pht004610.v1.p1.c1.BEAGESS_Subject_Phenotypes.GRU-MDS.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F)
-si <- beagess_data$BMI +
-sc <beagess_data$BMI +
-age <beagess_data$sex+
  
-beagess_data <- cbind(beagess_databeagess_data[,6]) +genos <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/GenotypeFiles/phg000580.v1.NCI_BEAGESS.genotype-calls-matrixfmt.c1.GRU-MDS.update/BEAGESS_dbGaP_29Jan2015.fam"header=FALSEstringsAsFactors=F)
-colnames(beagess_data)[13] <- "sc"+
  
  
-### SMOKING INITIATION +### The file reads into R incorrectly because of a weird trailing tab 
 +### character in the data file, so use the below code to shift column 
 +### names to the correct column. 
 +names(beagess)[1] <- "XXX" 
 +beagess$dbGaP_Subject_ID <- row.names(beagess) 
 +beagess$assocEABEvsCO <- NULL 
 +names(beagess) <- c(names(beagess)[2:11], "assocEABEvsCO", "dbGaP_Subject_ID"
 + 
 + 
 +### SMOKING INITIATION
 ### ###
 ### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI". ### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI".
Line 1062: Line 1063:
 ###       -99 = not consented ###       -99 = not consented
 ###       -9 = Missing ###       -9 = Missing
-###        0 = Never +###        0 = Never
 ###        1 = Former ###        1 = Former
 ###        2 = Current ###        2 = Current
-###     +### 
-### Descriptives: +### Descriptives:
 ### ###
 ### > table(beagess_data$BMI) ### > table(beagess_data$BMI)
-###  +### 
-### -99   -9    0    1    2  +### -99   -9    0    1    2 
-### 494 2051 1515 2288  575 +### 494 2051 1515 2288  575
 ### ###
 ### > summary(beagess_data$BMI) ### > summary(beagess_data$BMI)
-### Min. 1st Qu.  Median    Mean 3rd Qu.    Max.  +### Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-### -99.000  -9.000   0.000  -9.234   1.000   2.000 +### -99.000  -9.000   0.000  -9.234   1.000   2.000
  
-beagess_data$BMI[beagess_data$BMI == -99] = "x" +si <- beagess$cig_smk_status 
-beagess_data$BMI[beagess_data$BMI == -9] = "x" +si[si == 1 | si == 2] <2 
-beagess_data$BMI[beagess_data$BMI == 1= 2 +si[si == 0] <1 
-beagess_data$BMI[beagess_data$BMI == 0= 1+si[si != 1 & si != 2] <- NA
  
-### SMOKING Cessation +### SMOKING Cessation
 ### ###
 ### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI". ### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI".
Line 1089: Line 1090:
 ###       -99 = not consented ###       -99 = not consented
 ###       -9 = Missing ###       -9 = Missing
-###        0 = Never +###        0 = Never
 ###        1 = Former ###        1 = Former
 ###        2 = Current ###        2 = Current
-###  
-### Descriptives:  
 ### ###
-### > table(beagess_data$BMI) +### Descriptives: 
-###  +### table(beagess$cig_smk_status) 
-### -99   -9    0    1    2  +###  -99   -9    0    1    2  
-### 494 2051 1515 2288  575 +###  494 2051 1515 2288  575 
 ### ###
-### > summary(beagess_data$BMI) +### Current == 2 & Former == in GSCANThis is already the case for these data.
-### Min. 1st Qu.  Median    Mean 3rd Qu.    Max.  +
-### -99.000  -9.000   0.000  -9.234   1.000   2.000 +
  
-beagess_data$sc[beagess_data$sc == -99] = "x" +sc <- beagess$cig_smk_status 
-beagess_data$sc[beagess_data$sc == -9] = "x" +sc[sc == 0 | sc == -9 sc == -99<- NA
-beagess_data$sc[beagess_data$sc == 0= "x"+
  
-### AGE  +### BEAGESS variable name is "agegpcat"
-### +
-### BEAGESS variable name is "agegpcat" and is listed under the column "sex".+
 ###   "Age in 5 year categories" ###   "Age in 5 year categories"
-###   Response options are integers 1 through 14 or -9. +###   Response options are integers 1 through 14 or -9.
 ###       -9 = Missing ###       -9 = Missing
 ###       1 = 15-29 years of age ###       1 = 15-29 years of age
Line 1129: Line 1123:
 ###       14 = 90-100 years of age ###       14 = 90-100 years of age
 ### ###
-### Descriptives: +### Descriptives:
 ### ###
-### > table(beagess_data$sex) +### > table(beagess_data$agegpcat)
-###  +
-### -9    1    2    3    4    5    6    7    8    9   10   11   12   13   14  +
-### 27   19   48  112  190  378  657  958 1134 1238 1018  794  246   87   17 +
 ### ###
-### > summary(beagess_data$BMI) +### -9                              10   11   12   13   14 
-### Min. 1st Qu.  Median    Mean 3rd Qu.    Max.  +### 27   19   48  112  190  378  657  958 1134 1238 1018  794  246   87   17 
-### -9.000   7.000   8.000   8.227  10.000  14.000  +### 
- +### This is fine, but change the -9 to NA 
-beagess_data$sex[beagess_data$sex== 1] = 22 +beagess$agegpcat[beagess$agegpcat == -9] <- NA
-beagess_data$sex[beagess_data$sex== 2] = 32 +
-beagess_data$sex[beagess_data$sex== 3] = 37 +
-beagess_data$sex[beagess_data$sex== 4] = 42 +
-beagess_data$sex[beagess_data$sex== 5] = 47 +
-beagess_data$sex[beagess_data$sex== 6] = 52 +
-beagess_data$sex[beagess_data$sex== 7] = 57 +
-beagess_data$sex[beagess_data$sex== 8] = 62 +
-beagess_data$sex[beagess_data$sex== 9] = 67 +
-beagess_data$sex[beagess_data$sex== 10] = 72 +
-beagess_data$sex[beagess_data$sex== 11] = 77 +
-beagess_data$sex[beagess_data$sex== 12] = 82 +
-beagess_data$sex[beagess_data$sex== 13] = 87 +
-beagess_data$sex[beagess_data$sex== 14] = 95 +
-beagess_data$sex[beagess_data$sex== -9] = "x" +
-beagess_data$sex <- as.numeric(beagess_data$sex)+
  
-### rename SUBJID column to dbGaP_Subject_ID in genotype file so it matches phenotype file where SUBJID is misslabelled as dbGaP_Subject_ID 
-colnames(geno_data) <- c("famid", "dbGaP_Subject_ID", "patid", "matid", "sex") 
  
-### merge geno and pheno files 
-phen <- merge(geno_data,beagess_data, by="dbGaP_Subject_ID", all =TRUE) 
-phen <- phen[,c(1:5,7,10,17)] 
  
-phen <- phen[,c(2,1,3,4,5,6,7,8)] +### looks like BEAGESS uses their own internal "SUBJECT_ID" in the 
-colnames(phen) <- c("famid""dbGaP_Subject_ID""patid", "matid", "sex""age","si","sc")+### genotype filesso we'll use that in our pedigree files 
 +phenotypes <- data.frame(fid = beagess$SUBJECT_ID, 
 +                         iid = beagess$SUBJECT_ID, 
 +                         patid = "x", 
 +                         matid = "x", 
 +                         sex = beagess$sex, 
 +                         si = si, 
 +                         sc = sc)
  
 +phenotypes[is.na(phenotypes)] <- "x"
  
-phenotypes <- data.frame(famid=phen$famiddbGaP_Subject_ID=phen$dbGaP_Subject_IDpatid=phen$patidmatid=phen$matidsex=phen$sex, si=phen$si, sc=phen$sc, currentformersmoker=phen$sc, age=phen$age, age2=phen$age^2) +write.table(phenotypes, 
-colnames(phenotypes) <- c("famid", "dbGaP_Subject_ID", "patid", "matid", "sex", "si", "sc", "currentformersmoker", "age", "age2")+            "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/BEAGESS/GSCAN_BEAGESS_phenotypes.ped", 
 +            row.names=F, 
 +            quote F, 
 +            sep="\t")
  
-pcs <- read.table("/rc_scratch/hayo0753/BEAGESS/BEAGESS_pcs_and_ancestrys.txt",head=TRUE, stringsAsFactors=F, sep=" ") 
-colnames(pcs) [1] <- "dbGaP_Subject_ID" 
  
-gscan.phenotypes <- merge(phenotypespcsby="dbGaP_Subject_ID", all=TRUE) +covariates  <- data.frame(fid = beagess$SUBJECT_ID, 
-gscan.phenotypes[is.na(gscan.phenotypes)] <- "x"+                          iid = beagess$SUBJECT_ID, 
 +                          patid = "x", 
 +                          matid "x", 
 +                          sex = beagess$sex, 
 +                          age = beagess$age, 
 +                          age2 = beagess$age^2, 
 +                          Barretts.esophagus.case_v_control = beagess$assocBEvsCO, 
 +                          Esophageal.adenocarcinoma.case_v_control = beagess$assocEAvsCO)
  
-### EUROPEANS - entire sample is EUR +covariates[is.na(covariates)] <- "x"
-phenotypes.EUR.ped <- subset(gscan.phenotypes, ancestry == "EUR", select=c("famid","dbGaP_Subject_ID","patid","matid", "sex", "si", "sc")) +
-write.table(phenotypes.EUR.ped, file="BEAGESS.EUR.phenotypes.ped", quote=F, col.names=T, row.names=F, sep="\t")+
  
-covariates.EUR.ped <- subset(gscan.phenotypes, ancestry == "EUR", select=c("famid","dbGaP_Subject_ID","patid","matid", "sex", "age", "age2",  "currentformersmoker", "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10")) +write.table(covariates, 
-write.table(covariates.EUR.pedfile="BEAGESS.EUR.covariates.ped", quote=F, col.names=Trow.names=F, sep="\t")+                        "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/BEAGESS/GSCAN_BEAGESS_covariates.ped", 
 +                        row.names=F, 
 +                        quote = F, 
 +                        sep="\t"### save table                 
  
  
keller_and_evans_lab/gscan_db_ga_p.txt · Last modified: by 66.249.87.23