This shows you the differences between two versions of the page.
Both sides previous revision Previous revision | Next revision Both sides next revision | ||
keller_and_evans_lab:gscan_db_ga_p [2017/02/13 21:35] scott /* Phenotypes */ |
keller_and_evans_lab:gscan_db_ga_p [2017/02/14 21:14] scott /* Phenotypes */ |
||
---|---|---|---|
Line 1040: | Line 1040: | ||
- | beagess_data <- read.table("/ | + | options(stringsAsFactors=F) |
- | geno_data <- read.table("/ | + | |
- | geno_data <- geno_data[-c(6)] | + | |
- | geno_data$V3[geno_data$V3 ==" | + | |
- | geno_data$V4[geno_data$V4 ==" | + | |
- | ### IMPORTANT: All columns in original phenotype file are shifted one column to the right, so the label of the column does not match the data in that column!!! | + | beagess |
- | si <- beagess_data$BMI | + | |
- | sc <- beagess_data$BMI | + | |
- | age <- beagess_data$sex | + | |
- | beagess_data | + | genos <- read.table("/ |
- | colnames(beagess_data)[13] <- " | + | |
- | ### SMOKING INITIATION | + | ### The file reads into R incorrectly because of a weird trailing tab |
+ | ### character in the data file, so use the below code to shift column | ||
+ | ### names to the correct column. | ||
+ | names(beagess)[1] <- " | ||
+ | beagess$dbGaP_Subject_ID <- row.names(beagess) | ||
+ | beagess$assocEABEvsCO <- NULL | ||
+ | names(beagess) <- c(names(beagess)[2: | ||
+ | |||
+ | |||
+ | ### SMOKING INITIATION | ||
### | ### | ||
### BEAGESS variable name is " | ### BEAGESS variable name is " | ||
Line 1062: | Line 1063: | ||
### -99 = not consented | ### -99 = not consented | ||
### -9 = Missing | ### -9 = Missing | ||
- | ### 0 = Never | + | ### 0 = Never |
### 1 = Former | ### 1 = Former | ||
### 2 = Current | ### 2 = Current | ||
- | ### | + | ### |
- | ### Descriptives: | + | ### Descriptives: |
### | ### | ||
### > table(beagess_data$BMI) | ### > table(beagess_data$BMI) | ||
- | ### | + | ### |
- | ### -99 | + | ### -99 |
- | ### 494 2051 1515 2288 575 | + | ### 494 2051 1515 2288 575 |
### | ### | ||
### > summary(beagess_data$BMI) | ### > summary(beagess_data$BMI) | ||
- | ### Min. 1st Qu. Median | + | ### Min. 1st Qu. Median |
- | ### -99.000 | + | ### -99.000 |
- | beagess_data$BMI[beagess_data$BMI | + | si <- beagess$cig_smk_status |
- | beagess_data$BMI[beagess_data$BMI | + | si[si == 1 | si == 2] <- 2 |
- | beagess_data$BMI[beagess_data$BMI == 1] = 2 | + | si[si == 0] <- 1 |
- | beagess_data$BMI[beagess_data$BMI == 0] = 1 | + | si[si != 1 & si != 2] <- NA |
- | ### SMOKING Cessation | + | ### SMOKING Cessation |
### | ### | ||
### BEAGESS variable name is " | ### BEAGESS variable name is " | ||
Line 1089: | Line 1090: | ||
### -99 = not consented | ### -99 = not consented | ||
### -9 = Missing | ### -9 = Missing | ||
- | ### 0 = Never | + | ### 0 = Never |
### 1 = Former | ### 1 = Former | ||
### 2 = Current | ### 2 = Current | ||
- | ### | ||
- | ### Descriptives: | ||
### | ### | ||
- | ### > table(beagess_data$BMI) | + | ### Descriptives: |
- | ### | + | ### table(beagess$cig_smk_status) |
- | ### -99 | + | ### -99 |
- | ### 494 2051 1515 2288 575 | + | ### 494 2051 1515 2288 575 |
### | ### | ||
- | ### > summary(beagess_data$BMI) | + | ### Current == 2 & Former == 1 in GSCAN. This is already the case for these data. |
- | ### Min. 1st Qu. Median | + | |
- | ### -99.000 | + | |
- | beagess_data$sc[beagess_data$sc == -99] = " | + | sc <- beagess$cig_smk_status |
- | beagess_data$sc[beagess_data$sc == -9] = " | + | sc[sc == 0 | sc == -9 | sc == -99] <- NA |
- | beagess_data$sc[beagess_data$sc == 0] = " | + | |
- | ### AGE | + | ### BEAGESS variable name is " |
- | ### | + | |
- | ### BEAGESS variable name is " | + | |
### " | ### " | ||
- | ### | + | ### |
### -9 = Missing | ### -9 = Missing | ||
### 1 = 15-29 years of age | ### 1 = 15-29 years of age | ||
Line 1129: | Line 1123: | ||
### 14 = 90-100 years of age | ### 14 = 90-100 years of age | ||
### | ### | ||
- | ### Descriptives: | + | ### Descriptives: |
### | ### | ||
- | ### > table(beagess_data$sex) | + | ### > table(beagess_data$agegpcat) |
- | ### | + | |
- | ### -9 1 2 3 4 5 6 7 8 9 | + | |
- | ### 27 | + | |
### | ### | ||
- | ### > summary(beagess_data$BMI) | + | ### -9 1 2 3 4 5 6 7 8 9 |
- | ### Min. 1st Qu. Median | + | ### 27 |
- | ### -9.000 | + | ### |
- | + | ### This is fine, but change the -9 to NA | |
- | beagess_data$sex[beagess_data$sex== | + | beagess$agegpcat[beagess$agegpcat |
- | beagess_data$sex[beagess_data$sex== | + | |
- | beagess_data$sex[beagess_data$sex== | + | |
- | beagess_data$sex[beagess_data$sex== | + | |
- | beagess_data$sex[beagess_data$sex== | + | |
- | beagess_data$sex[beagess_data$sex== | + | |
- | beagess_data$sex[beagess_data$sex== | + | |
- | beagess_data$sex[beagess_data$sex== | + | |
- | beagess_data$sex[beagess_data$sex== | + | |
- | beagess_data$sex[beagess_data$sex== | + | |
- | beagess_data$sex[beagess_data$sex== | + | |
- | beagess_data$sex[beagess_data$sex== | + | |
- | beagess_data$sex[beagess_data$sex== 13] = 87 | + | |
- | beagess_data$sex[beagess_data$sex== 14] = 95 | + | |
- | beagess_data$sex[beagess_data$sex== -9] = " | + | |
- | beagess_data$sex | + | |
- | ### rename SUBJID column to dbGaP_Subject_ID in genotype file so it matches phenotype file where SUBJID is mislabelled as dbGaP_Subject_ID | ||
- | colnames(geno_data) <- c(" | ||
- | ### merge geno and pheno files | ||
- | phen <- merge(geno_data, | ||
- | phen <- phen[, | ||
- | phen <- phen[,c(2, | + | ### looks like BEAGESS uses their own internal " |
- | colnames(phen) | + | ### genotype files, so we'll use that in our pedigree files |
+ | phenotypes | ||
+ | iid = beagess$SUBJECT_ID, | ||
+ | patid = "x", | ||
+ | matid = "x", | ||
+ | sex = beagess$sex, | ||
+ | si = si, | ||
+ | sc = sc) | ||
+ | phenotypes[is.na(phenotypes)] <- " | ||
- | phenotypes <- data.frame(famid=phen$famid, dbGaP_Subject_ID=phen$dbGaP_Subject_ID, patid=phen$patid, matid=phen$matid, sex=phen$sex, si=phen$si, sc=phen$sc, currentformersmoker=phen$sc, | + | write.table(phenotypes, |
- | colnames(phenotypes) <- c(" | + | "/ |
+ | row.names=F, | ||
+ | quote = F, | ||
+ | sep="\t") | ||
- | pcs <- read.table("/ | ||
- | colnames(pcs) [1] <- " | ||
- | gscan.phenotypes | + | covariates |
- | gscan.phenotypes[is.na(gscan.phenotypes)] <- " | + | iid = beagess$SUBJECT_ID, |
+ | patid = "x", | ||
+ | matid = " | ||
+ | sex = beagess$sex, | ||
+ | age = beagess$age, | ||
+ | age2 = beagess$age^2, | ||
+ | | ||
+ | Esophageal.adenocarcinoma.case_v_control = beagess$assocEAvsCO) | ||
- | ### EUROPEANS - entire sample | + | covariates[is.na(covariates)] |
- | phenotypes.EUR.ped | + | |
- | write.table(phenotypes.EUR.ped, | + | |
- | covariates.EUR.ped <- subset(gscan.phenotypes, | + | write.table(covariates, |
- | write.table(covariates.EUR.ped, file=" | + | |
+ | row.names=F, | ||
+ | quote = F, | ||
+ | | ||