User Tools

Site Tools


keller_and_evans_lab:gscan_db_ga_p

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
Next revision Both sides next revision
keller_and_evans_lab:gscan_db_ga_p [2017/02/13 21:35]
scott /* Phenotypes */
keller_and_evans_lab:gscan_db_ga_p [2017/02/14 21:14]
scott /* Phenotypes */
Line 1040: Line 1040:
  
  
-beagess_data <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/PhenotypeFiles/phs000869.v1.pht004610.v1.p1.c1.BEAGESS_Subject_Phenotypes.GRU-MDS.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F) +options(stringsAsFactors=F)
-geno_data <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/GenotypeFiles/phg000580.v1.NCI_BEAGESS.genotype-calls-matrixfmt.c1.GRU-MDS.update/BEAGESS_dbGaP_29Jan2015.fam", header=FALSE, stringsAsFactors=F) +
-geno_data <- geno_data[-c(6)] +
-geno_data$V3[geno_data$V3 =="0"] = "x" +
-geno_data$V4[geno_data$V4 =="0"] = "x"+
  
-### IMPORTANT: All columns in original phenotype file are shifted one column to the the right, so the label of the column does not match the data in that column!!! +beagess <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/PhenotypeFiles/phs000869.v1.pht004610.v1.p1.c1.BEAGESS_Subject_Phenotypes.GRU-MDS.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F)
-si <- beagess_data$BMI +
-sc <beagess_data$BMI +
-age <beagess_data$sex+
  
-beagess_data <- cbind(beagess_databeagess_data[,6]) +genos <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/GenotypeFiles/phg000580.v1.NCI_BEAGESS.genotype-calls-matrixfmt.c1.GRU-MDS.update/BEAGESS_dbGaP_29Jan2015.fam"header=FALSEstringsAsFactors=F)
-colnames(beagess_data)[13] <- "sc"+
  
  
-### SMOKING INITIATION +### The file reads into R incorrectly because of a weird trailing tab 
 +### character in the data file, so use the below code to shift column 
 +### names to the correct column. 
 +names(beagess)[1] <- "XXX" 
 +beagess$dbGaP_Subject_ID <- row.names(beagess) 
 +beagess$assocEABEvsCO <- NULL 
 +names(beagess) <- c(names(beagess)[2:11], "assocEABEvsCO", "dbGaP_Subject_ID"
 + 
 + 
 +### SMOKING INITIATION
 ### ###
 ### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI". ### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI".
Line 1062: Line 1063:
 ###       -99 = not consented ###       -99 = not consented
 ###       -9 = Missing ###       -9 = Missing
-###        0 = Never +###        0 = Never
 ###        1 = Former ###        1 = Former
 ###        2 = Current ###        2 = Current
-###     +### 
-### Descriptives: +### Descriptives:
 ### ###
 ### > table(beagess_data$BMI) ### > table(beagess_data$BMI)
-###  +### 
-### -99   -9    0    1    2  +### -99   -9    0    1    2 
-### 494 2051 1515 2288  575 +### 494 2051 1515 2288  575
 ### ###
 ### > summary(beagess_data$BMI) ### > summary(beagess_data$BMI)
-### Min. 1st Qu.  Median    Mean 3rd Qu.    Max.  +### Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-### -99.000  -9.000   0.000  -9.234   1.000   2.000 +### -99.000  -9.000   0.000  -9.234   1.000   2.000
  
-beagess_data$BMI[beagess_data$BMI == -99] = "x" +si <- beagess$cig_smk_status 
-beagess_data$BMI[beagess_data$BMI == -9] = "x" +si[si == 1 | si == 2] <2 
-beagess_data$BMI[beagess_data$BMI == 1= 2 +si[si == 0] <1 
-beagess_data$BMI[beagess_data$BMI == 0= 1+si[si != 1 & si != 2] <- NA
  
-### SMOKING Cessation +### SMOKING Cessation
 ### ###
 ### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI". ### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI".
Line 1089: Line 1090:
 ###       -99 = not consented ###       -99 = not consented
 ###       -9 = Missing ###       -9 = Missing
-###        0 = Never +###        0 = Never
 ###        1 = Former ###        1 = Former
 ###        2 = Current ###        2 = Current
-###  
-### Descriptives:  
 ### ###
-### > table(beagess_data$BMI) +### Descriptives: 
-###  +### table(beagess$cig_smk_status) 
-### -99   -9    0    1    2  +###  -99   -9    0    1    2  
-### 494 2051 1515 2288  575 +###  494 2051 1515 2288  575 
 ### ###
-### > summary(beagess_data$BMI) +### Current == 2 & Former == in GSCANThis is already the case for these data.
-### Min. 1st Qu.  Median    Mean 3rd Qu.    Max.  +
-### -99.000  -9.000   0.000  -9.234   1.000   2.000 +
  
-beagess_data$sc[beagess_data$sc == -99] = "x" +sc <- beagess$cig_smk_status 
-beagess_data$sc[beagess_data$sc == -9] = "x" +sc[sc == 0 | sc == -9 sc == -99<- NA
-beagess_data$sc[beagess_data$sc == 0= "x"+
  
-### AGE  +### BEAGESS variable name is "agegpcat"
-### +
-### BEAGESS variable name is "agegpcat" and is listed under the column "sex".+
 ###   "Age in 5 year categories" ###   "Age in 5 year categories"
-###   Response options are integers 1 through 14 or -9. +###   Response options are integers 1 through 14 or -9.
 ###       -9 = Missing ###       -9 = Missing
 ###       1 = 15-29 years of age ###       1 = 15-29 years of age
Line 1129: Line 1123:
 ###       14 = 90-100 years of age ###       14 = 90-100 years of age
 ### ###
-### Descriptives: +### Descriptives:
 ### ###
-### > table(beagess_data$sex) +### > table(beagess_data$agegpcat)
-###  +
-### -9    1    2    3    4    5    6    7    8    9   10   11   12   13   14  +
-### 27   19   48  112  190  378  657  958 1134 1238 1018  794  246   87   17 +
 ### ###
-### > summary(beagess_data$BMI) +### -9                              10   11   12   13   14 
-### Min. 1st Qu.  Median    Mean 3rd Qu.    Max.  +### 27   19   48  112  190  378  657  958 1134 1238 1018  794  246   87   17 
-### -9.000   7.000   8.000   8.227  10.000  14.000  +### 
- +### This is fine, but change the -9 to NA 
-beagess_data$sex[beagess_data$sex== 1] = 22 +beagess$agegpcat[beagess$agegpcat == -9] <- NA
-beagess_data$sex[beagess_data$sex== 2] = 32 +
-beagess_data$sex[beagess_data$sex== 3] = 37 +
-beagess_data$sex[beagess_data$sex== 4] = 42 +
-beagess_data$sex[beagess_data$sex== 5] = 47 +
-beagess_data$sex[beagess_data$sex== 6] = 52 +
-beagess_data$sex[beagess_data$sex== 7] = 57 +
-beagess_data$sex[beagess_data$sex== 8] = 62 +
-beagess_data$sex[beagess_data$sex== 9] = 67 +
-beagess_data$sex[beagess_data$sex== 10] = 72 +
-beagess_data$sex[beagess_data$sex== 11] = 77 +
-beagess_data$sex[beagess_data$sex== 12] = 82 +
-beagess_data$sex[beagess_data$sex== 13] = 87 +
-beagess_data$sex[beagess_data$sex== 14] = 95 +
-beagess_data$sex[beagess_data$sex== -9] = "x" +
-beagess_data$sex <- as.numeric(beagess_data$sex)+
  
-### rename SUBJID column to dbGaP_Subject_ID in genotype file so it matches phenotype file where SUBJID is misslabelled as dbGaP_Subject_ID 
-colnames(geno_data) <- c("famid", "dbGaP_Subject_ID", "patid", "matid", "sex") 
  
-### merge geno and pheno files 
-phen <- merge(geno_data,beagess_data, by="dbGaP_Subject_ID", all =TRUE) 
-phen <- phen[,c(1:5,7,10,17)] 
  
-phen <- phen[,c(2,1,3,4,5,6,7,8)] +### looks like BEAGESS uses their own internal "SUBJECT_ID" in the 
-colnames(phen) <- c("famid""dbGaP_Subject_ID""patid", "matid", "sex""age","si","sc")+### genotype filesso we'll use that in our pedigree files 
 +phenotypes <- data.frame(fid = beagess$SUBJECT_ID, 
 +                         iid = beagess$SUBJECT_ID, 
 +                         patid = "x", 
 +                         matid = "x", 
 +                         sex = beagess$sex, 
 +                         si = si, 
 +                         sc = sc)
  
 +phenotypes[is.na(phenotypes)] <- "x"
  
-phenotypes <- data.frame(famid=phen$famiddbGaP_Subject_ID=phen$dbGaP_Subject_IDpatid=phen$patidmatid=phen$matidsex=phen$sex, si=phen$si, sc=phen$sc, currentformersmoker=phen$sc, age=phen$age, age2=phen$age^2) +write.table(phenotypes, 
-colnames(phenotypes) <- c("famid", "dbGaP_Subject_ID", "patid", "matid", "sex", "si", "sc", "currentformersmoker", "age", "age2")+            "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/BEAGESS/GSCAN_BEAGESS_phenotypes.ped", 
 +            row.names=F, 
 +            quote F, 
 +            sep="\t")
  
-pcs <- read.table("/rc_scratch/hayo0753/BEAGESS/BEAGESS_pcs_and_ancestrys.txt",head=TRUE, stringsAsFactors=F, sep=" ") 
-colnames(pcs) [1] <- "dbGaP_Subject_ID" 
  
-gscan.phenotypes <- merge(phenotypespcsby="dbGaP_Subject_ID", all=TRUE) +covariates  <- data.frame(fid = beagess$SUBJECT_ID, 
-gscan.phenotypes[is.na(gscan.phenotypes)] <- "x"+                          iid = beagess$SUBJECT_ID, 
 +                          patid = "x", 
 +                          matid "x", 
 +                          sex = beagess$sex, 
 +                          age = beagess$age, 
 +                          age2 = beagess$age^2, 
 +                          Barretts.esophagus.case_v_control = beagess$assocBEvsCO, 
 +                          Esophageal.adenocarcinoma.case_v_control = beagess$assocEAvsCO)
  
-### EUROPEANS - entire sample is EUR +covariates[is.na(covariates)] <- "x"
-phenotypes.EUR.ped <- subset(gscan.phenotypes, ancestry == "EUR", select=c("famid","dbGaP_Subject_ID","patid","matid", "sex", "si", "sc")) +
-write.table(phenotypes.EUR.ped, file="BEAGESS.EUR.phenotypes.ped", quote=F, col.names=T, row.names=F, sep="\t")+
  
-covariates.EUR.ped <- subset(gscan.phenotypes, ancestry == "EUR", select=c("famid","dbGaP_Subject_ID","patid","matid", "sex", "age", "age2",  "currentformersmoker", "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10")) +write.table(covariates, 
-write.table(covariates.EUR.pedfile="BEAGESS.EUR.covariates.ped", quote=F, col.names=Trow.names=F, sep="\t")+                        "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/BEAGESS/GSCAN_BEAGESS_covariates.ped", 
 +                        row.names=F, 
 +                        quote = F, 
 +                        sep="\t"### save table                 
  
  
keller_and_evans_lab/gscan_db_ga_p.txt · Last modified: 2019/10/31 12:28 by 66.249.87.23