======= UK Biobank ======= More information about the files used for [[uk_biobank|UKBiobank are here]]. In brief, we used the UK10K + 1kgp3 imputed vcfs provided by UKBionank and added in dosages w/ this python script: import gzip, argparse, re, os, datetime from subprocess import Popen, PIPE def add_dosage(pair): a, b = pair probs = b.split(b',') dose = float(probs[1]) + (float(probs[2]) * 2) return a + b':' + str(dose).encode('ascii') + b':' + b def gziplines(fname): f = Popen(['zcat', fname], stdout=PIPE) for line in f.stdout: yield line parser = argparse.ArgumentParser() parser.add_argument('inputVCF', help = 'The path to the VCF') args = parser.parse_args() flag = False for line in gziplines(args.inputVCF): if line.startswith(b'#'): os.write(1, line.rstrip() + b'\n') if not flag: os.write(1, b'##FORMAT=\n') os.write(1, b'##Dosages added using the script add.dosages.subprocess.py at ' + str(datetime.datetime.now()).encode('ascii') + b'\n') flag = True else: elements = re.split(b'\t|:', line.rstrip()) first8 = elements[:8] genotypes = elements[10:] form = b'GT:DS:GP' genotypes_split = zip(genotypes[::2], genotypes[1::2]) try: dose_genos = [add_dosage(pair) for pair in genotypes_split] except (ValueError, IndexError) as e: os.write(2, "\n" + line) os.write(2, line + "\n" + args.inputVCF + "\n\n") raise e os.write(1, b'\t'.join(first8) + b'\t' + form + b'\t' + b'\t'.join(dose_genos) + b'\n') ======= dbGaP Studies ======= ====== Framingham ====== ===== Phenotypes ===== options(stringsAsFactors=F) ### Offspring cohort off <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht000030.v7.p10.c1.ex1_1s.HMB-IRB-MDS.txt.gz"), header=T, sep="\t") ### Generation 3 cohort g3 <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht000074.v9.p10.c1.ex3_1s.HMB-IRB-MDS.txt.gz"), header=T, skip=10, sep="\t", fill=T) ###---------------------### ### Cigarettes per Day ### ###---------------------### ### Offspring Cohort Exam 1 ### Framingham variable name is "A102". ### "Usual number of cigarettes smoked (now or formerly)" ### Response option is an integer value ### ### table(off$A102) ### 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ### 1208 50 29 44 26 47 31 16 30 3 175 1 20 1 9 94 ### 16 17 18 19 20 22 24 25 26 27 28 30 34 35 40 42 ### 5 6 50 1 556 10 3 39 1 1 2 214 1 15 147 2 ### 45 50 60 80 88 90 ### 4 21 19 1 14 2 ### ### summary(off$A102) ### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's ### 0.00 0.00 7.00 12.11 20.00 90.00 8 cpd.off <- off$A102 cpd.off[cpd.off==0] <- NA ### Set those who smoke "0" cpd to NA cpd.off[cpd.off > 60] <- 60 ### Set those who report smoking >40 cpd to 40 cpd.off[cpd.off <= 5 & cpd.off >= 1] <- 1 cpd.off[cpd.off <= 15 & cpd.off >= 6] <- 2 cpd.off[cpd.off <= 25 & cpd.off >= 16] <- 3 cpd.off[cpd.off <= 35 & cpd.off >= 26] <- 4 cpd.off[cpd.off >= 36 & cpd.off <= 60] <- 5 cpd.off[cpd.off > 60 | is.na(cpd.off)] <- NA ### G3 cohort Exam 1 ### Framingham variable name is "G3A074 ### "IF EVER SMOKED CIGS REGULARLY: ON THE AVERAGE ### OF THE ENTIRE TIME YOU SMOKED, HOW MANY CIGARETTES ### DID YOU SMOKE PER DAY?" ### Response option is an integer value ### ### table(g3$G3A074) ### ### 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ### 3180 57 14 15 20 30 8 13 7 2 122 2 16 3 2 60 ### 16 17 18 19 20 21 22 23 24 25 28 30 35 40 45 50 ### 3 3 10 1 287 2 1 1 1 12 1 53 1 47 1 12 ### 55 60 80 ### 1 6 1 ### ### summary(g3$G3A074) ### Min. 1st Qu. Median Mean 3rd Qu. Max. ### 0.000 0.000 0.000 3.528 0.000 80.000 cpd.g3 <- g3$G3A074 cpd.g3[cpd.g3==0] <- NA ### Set those who smoke "0" cpd to NA cpd.g3[cpd.g3 > 60] <- 60 ### Set those who report smoking >40 cpd to 40 cpd.g3[cpd.g3 <= 5 & cpd.g3 >= 1] <- 1 cpd.g3[cpd.g3 <= 15 & cpd.g3 >= 6] <- 2 cpd.g3[cpd.g3 <= 25 & cpd.g3 >= 16] <- 3 cpd.g3[cpd.g3 <= 35 & cpd.g3 >= 26] <- 4 cpd.g3[cpd.g3 >= 36 & cpd.g3 <= 60] <- 5 cpd.g3[cpd.g3 > 60 | is.na(cpd.g3)] <- NA ###--------------------### ### Smoking initiation ### ###--------------------### ### Offspring Cohort Exam 1 ### Framingham variable is "A99" ### "Smokes cigarettes: Yes(now)/No/Former" ### Response option is fill-in ### 0 is no, 1 is yes (now), 2 is former ### ### table(off$A99) ### ### 0 1 2 ### 1203 1126 566 si.off <- off$A99 si.off[si.off==1] <- 2 ### Set current smoker "1" to smoker "2" si.off[si.off==0] <- 1 ### Set never smoker "0" to nonsmoker "1" ### Former smoker "2" remains as a "2" as smoker ### G3 Cohort Exam 1 ### Framingham variables is "G3A070" ### G3A070 - HAVE YOU EVER SMOKED CIGARETTES REGULARLY? (NO MEANS ### LESS THAN 20 PACKS OF CIGARETTES OR 12 OZ OF TOBACCO IN A ### LIFETIME OR LESS THAN 1 CIGARETTE A DAY FOR A YEAR.)" ### Response options: ### "0" is no, "1" is yes, "0" is unknown ### ### table(g3$G3A070) ### 0 1 20 ### 2137 1629 1 si.g3 <- g3$G3A070 si.g3[si.g3==20] <- NA si.g3[si.g3==1] <- 2 si.g3[si.g3==0] <- 1 ###---------------------### ### Smoking cessation ### ###---------------------### ### Offspring Cohort Exam 1 ### Framingham variable is "A99" ### Smokes cigarettes: Yes(now)/No/Former" ### Response option is fill-in ### 0 is no, 1 is yes (now), 2 is former ### ### table(off$A99) ### ### 0 1 2 ### 1203 1126 566 sc.off <- off$A99 sc.off[sc.off==2] <- 3 ### temporarily set former smoker "2" as "3" sc.off[sc.off==1] <- 2 ### set current smoker "1" as current smoker "2" sc.off[sc.off==3] <- 1 ### set former smoker "3" as former smoker "1" sc.off[sc.off==0] <- NA ### code all non smokers "0" as NA ### G3 Cohort Exam 1 ### Framingham variables are “G3A070” and “G3A072” ### Response option is fill-in ### G3A070 - HAVE YOU EVER SMOKED CIGARETTES REGULARLY? (NO MEANS LESS THAN 20 PACKS OF CIGARETTES OR 12 OZ OF ###TOBACCO IN A LIFETIME OR LESS THAN 1 CIGARETTE A DAY FOR A YEAR.) ###“0” is no, “1” is yes, “20” is unknown ### G3A072 - IF EVER SMOKED CIGARETTES REGULARLY: DO YOU NOW SMOKE CIGARETTES (AS OF 1 MONTH AGO)? ###“0” is no or never smoked, “1” is yes ### ### table(g3$G3A070) ### 0 1 20 ### 2137 1629 1 ### ### table(g3$G3A072) ### 0 1 ### 3179 585 sc.g3 <- g3$G3A072 sc.g3[sc.g3==1] <- 2 ## These are our current smokers sc.g3[sc.g3==0] <- 1 sc.g3 <- ifelse(sc.g3==1 & (g3$G3A070 == 0 | is.na(g3$G3A070)), NA, sc.g3) ###---------------------### ### Age of initiation ### ###---------------------### ### Offspring Cohort Exam 1 ### Framingham variable "A100" ### Age started smoking cigarettes regularly\ ### Response option is an integer value, 88 is doesn't smoke regularly ### table(off$A100) ### ### 0 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 ### 6 2 2 2 8 4 24 37 61 147 266 233 330 166 164 85 42 26 13 25 ### 26 27 28 29 30 31 33 34 35 37 38 39 40 44 45 46 51 88 ### 2 8 4 3 7 1 1 1 3 1 2 1 3 1 1 2 1 105 ### ai.off <- off$A100 ai.off[ai.off < 10] <- NA ### Set AI less than 10 as missing\ ai.off[ai.off > 35] <- NA ### Set AI greater than 35 as missing \ ### G3 Cohort Exam 1 ### Framingham variable "G3A075" ### IF EVER SMOKED CIGARETTES REGULARLY: HOW OLD WERE YOU WHEN YOU FIRST STARTED REGULAR CIGARETTE SMOKING? ### Response option is an integer value, 0 is never smoked ### table(g3$G3A075) ### ### 0 5 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ### 2138 1 2 2 3 5 14 52 108 159 152 277 193 256 98 107 ### 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 ### 51 31 19 13 31 6 4 6 2 13 2 1 1 4 1 4 ### 37 38 40 42 46 50 ### 2 2 1 1 1 2 ### ai.g3 <- g3$G3A075 ai.g3[ai.g3 < 10] <- NA ### Set AI less than 10 as missing ai.g3[ai.g3 > 35] <- NA ### Set AI greater than 35 as missing ###-------------------### ### Drinks per week ### ###-------------------### ### Offspring Cohort Exam 1 ### Framingham variables "A111", "A112", "A113" ### A111 - Beer (bottles, cans or glasses per week) ### A112 - Wine (glasses per week) ### A113 - Cocktails, highballs, straight drinks (# per week) ### Response options are integer values. ### table(off$A111) ### ### 0 1 2 3 4 5 6 7 8 9 10 12 14 15 16 18 ### 1452 524 171 131 78 49 125 34 33 4 46 68 21 21 5 13 ### 20 21 24 25 26 28 30 35 36 40 42 48 49 50 70 90 ### 18 7 48 5 1 6 6 2 2 4 4 6 1 2 1 1 ### ### table(off$A112) ### ### 0 1 2 3 4 5 6 7 8 9 10 12 14 15 16 18 ### 1329 919 218 105 69 41 22 64 25 3 25 11 30 2 2 2 ### 20 21 24 28 30 32 50 ### 7 4 2 3 1 2 1 ### ### table(off$A113) ### ### 0 1 2 3 4 5 6 7 8 9 10 12 14 15 16 18 ### 861 1094 267 157 96 64 59 65 31 9 49 16 57 13 2 1 ### 19 20 21 24 25 28 30 35 38 42 50 55 60 61 ### 1 12 10 4 2 2 6 2 1 1 1 1 1 1 ### dpw.off <- rowSums(subset(off, select = c("A111", "A112", "A113")), na.rm=T) ### The above assumes all drinks are of equal ETOH content dpw.off[dpw.off == 0] <- NA dpw.off <- log(dpw.off) ### G3 Cohort Exam 1\ ### Framingham variables are "G3A115", "G3A116", "G3A119", "G3A120", ### "G3A123", "G3A124", "G3A127", "G3A128", ### "G3A131", "G3A132" ### G3A115 - BEER: NUMBER OF BEER (12 OZ. BOTTLE, GLASS, CAN) YOU DRINK PER WEEK OVER THE PAST YEAR ### G3A116 - BEER: NUMBER OF BEER (12 OZ. BOTTLE, GLASS, CAN) YOU DRINK PER MONTH OVER THE PAST YEAR ### G3A119 - WHITE WINE: NUMBER OF WHITE WINE (4 OZ GLASS) YOU DRINK PER WEEK OVER THE PAST YEAR ### G3A120 - WHITE WINE: NUMBER OF WHITE WINE (4 OZ GLASS) YOU DRINK PER MONTH OVER THE PAST YEAR ### G3A123 - RED WINE: NUMBER OF RED WINE (4 OZ GLASS) YOU DRINK PER WEEK OVER THE PAST YEAR ### G3A124 - RED WINE: NUMBER OF RED WINE (4 OZ GLASS) YOU DRINK PER MONTH OVER THE PAST YEAR ### G3A127 - LIQUOR/SPIRITS: AVERAGE NUMBER OF LIQUOR/SPIRITS (1 1/4 OZ JIGGER) YOU DRINK PER WEEK OVER THE PAST YEAR ### G3A128 - LIQUOR/SPIRITS: AVERAGE NUMBER OF LIQUOR/SPIRITS (1 1/4 OZ JIGGER) YOU DRINK PER MONTH OVERTHE PAST YEAR ### G3A131 - OTHER BEVERAGE: AVERAGE NUMBER OF OTHER BEVERAGE YOU DRINK PER WEEK OVER THE PAST YEAR ### G3A132 - OTHER BEVERAGE: AVERAGE NUMBER OF OTHER BEVERAGE YOU DRINK PER MONTH OVER THE PAST YEAR ### ### *Participant was allowed to report alcohol consumption in either drinks per week or drinks per ### month. Therefore, to calculate total alcohol consumption, you must use both number of drinks per ### week and number of drinks per month (E.G. DRINKS PER MONTH = SUM(OF (DRINKS PER WEEK)(DRINKS PER ### MONTH/4)). ### ### Response options are integer values. ### all <- subset(g3, select = c("G3A115", "G3A116", "G3A119", "G3A120", "G3A123", "G3A124", "G3A127", "G3A128", "G3A131", "G3A132")) names(all) <- c("beerpw", "beerpm", "wwinepw", "wwinepm", "rwinepw", "rwinepm", "liqpw", "liqpm", "othpw", "othpm") w <- all[,c(1,3,5,7,9)] m <- all[,c(2,4,6,8,10)]/4 dpw.tmp <- data.frame(beer = rowSums(cbind(w$beerpw, m$beerpm), na.rm=T), wwine = rowSums(cbind(w$wwinepw, m$wwinepm), na.rm=T), rwine = rowSums(cbind(w$rwinepw, m$rwinepm), na.rm=T), liq = rowSums(cbind(w$liqpw, m$liqpm), na.rm=T), oth = rowSums(cbind(w$othpw, m$othpm), na.rm=T)) ### WHITE WINE - convert from 4oz to 5oz drink dpw.tmp$wwine <- dpw.tmp$wwine*5/4 ### RED WINE - convert from 4oz to 5oz drink dpw.tmp$rwine <- dpw.tmp$rwine*5/4 ### LIQUOR - convert from 1.25 to 1.50 drink dpw.tmp$liq <- dpw.tmp$liq*1.5/1.25 ### OTHER DRINK is an unknown. leave as-is. dpw.g3 <- rowSums(dpw.tmp, na.rm=T) ### We have a few outliers, draw cuttoff at 70 drinks / week dpw.g3[dpw.g3 > 70] <- NA dpw.g3[dpw.g3 == 0] <- NA dpw.g3 <- log(dpw.g3 + .75) ###---------------------------### ### Drinker vs. non-drinker ### ###---------------------------### ### ### Use the variables already computed for dpw above. Assume ### anyone who is drinking less than 1 drink / month is a non-drinker x <- subset(off, select = c("A111", "A112", "A113")) index <- rep(NA, nrow(x)) for(i in 1:nrow(x)) { if(is.na(x[i,1]) & is.na(x[i,2]) & is.na(x[i,3])) { index[i] <- 1 } else { index[i] <- 0 } } dnd.off <- ifelse(rowSums(subset(off, select = c("A111", "A112", "A113")), na.rm=T) == 0, 1, 2) dnd.off[index==1] <- NA x <- all index <- rep(NA, nrow(x)) for(i in 1:nrow(x)) { if(is.na(x[i,1]) & is.na(x[i,2]) & is.na(x[i,3])) { index[i] <- 1 } else { index[i] <- 0 } } dnd.g3 <- ifelse(rowSums(all, na.rm=T) == 0, 1, 2) dnd.g3[index==1] <- NA ###----------------------------### ### Binge drinking in everyone ### ### (Only available for G3) ### ###----------------------------### ### G3 Cohort Exam 1 ### Framingham variable is "G3A137" ### IF EVER CONSUMED ALCOHOL: WHAT WAS THE MAXIMUM NUMBER OF ### DRINKS YOU HAD IN A 24 HOUR PERIOD DURING THE PAST MONTH? ### Response option is an integer value ### ### summary(g3$G3A137) ### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's ### 0.000 1.000 3.000 3.566 5.000 50.000 3 bde <- g3$G3A137 sex <- g3$G3A440 ### bde is defined differently for males & females ### sex == 1 is male, sex == 2 is female tmp <- rep(NA, length(bde)) ### temporary vector to hold results and avoid conflicts tmp[sex == 1 & bde < 5] <- 1 tmp[sex == 1 & bde >=5] <- 2 tmp[sex == 2 & bde < 4] <- 1 tmp[sex == 2 & bde >=4] <- 2 bde.g3 <- tmp ###-----------------------------------------### ### Binge-drinking (in lifetime drinkers) ### ### (Only available for G3) ### ###-----------------------------------------### ### G3 Cohort Exam 1 ### Framingham variable is "G3A112" ### HAVE YOU EVER CONSUMED ALCOHOLIC BEVERAGES (BEER, WINE, LIQUOR/SPIRITS)? \ ### 0 is no, 1 is yes lifednd <- g3$G3A112 ## No missings, so next command has no problem bdl.g3 <- bde.g3 bdl.g3 <- ifelse(lifednd == 0, NA, bde.g3) ################## ################## ### ### ### Covariates ### ### ### ################## ################## ### HEIGHT ### Offspring Cohort variable is “A51” ### Generation 3 Cohort variable is “G3A446” height.off <- off$A51 height.g3 <- g3$G3A446 ### WEIGHT ### Offspring Cohort variable is “A50” ### Generation 3 Cohort variable is “G3A444” weight.off <- off$A50 weight.g3 <- g3$G3A444 ###-------------### ### Age at exam ### ###-------------### ### Get BIRTHDATE birth <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht000740.v7.p10.c1.birthyr_alls.HMB-IRB-MDS.txt.gz"), header=T, sep="\t") off.IDs <- off$dbGaP_Subject_ID g3.IDs <- g3$dbGaP_Subject_ID birth.off <- subset(birth, dbGaP_Subject_ID %in% off.IDs) birth.g3 <- subset(birth, dbGaP_Subject_ID %in% g3.IDs) ### True exam dates spanned 1971 - 1975 for offspring ### True exam dates spanned 2002 - 2005 for generation 3 ### A good-enough approximation is to take the middle ### year and subtract birth year to get age at exam ### Offspring Cohort off.age <- data.frame(dbGaP_Subject_ID = birth.off$dbGaP_Subject_ID, Age = 1973 - birth.off$birthyr) ### Framingham Cohort g3.age <- data.frame(dbGaP_Subject_ID = birth.g3$dbGaP_Subject_ID, Age = 2004 - birth.g3$birthyr) age <- rbind(off.age, g3.age) #################################### #################################### ### ### ### Create Phenotype Data Frames ### ### ### #################################### #################################### offspring <- data.frame(dbGaP_Subject_ID = off$dbGaP_Subject_ID, shareid = off$shareid, cpd = cpd.off, si = si.off, sc = sc.off, ai = ai.off, dpw = dpw.off, dnd = dnd.off, bde = rep(NA, nrow(off)), bdl = rep(NA, nrow(off)), height = height.off, weight = weight.off, cohort = rep(1, nrow(off))) generation3 <- data.frame(dbGaP_Subject_ID = g3$dbGaP_Subject_ID, shareid = g3$shareid, cpd = cpd.g3, si = si.g3, sc = sc.g3, ai = ai.g3, dpw = dpw.g3, dnd = dnd.g3, bde = bde.g3, bdl = bdl.g3, height = height.g3, weight = weight.g3, cohort = rep(2, nrow(g3))) tmp <- rbind(offspring, generation3) phenotypes <- merge(tmp, age, by="dbGaP_Subject_ID") phenotypes$age2 <- phenotypes$Age^2 ################## ### ID MAPPING ### ################## ### ID mapping file from dbGaP_Subject_ID to SAMPID ID.map <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht001415.v16.p10.Framingham_Sample.MULTI.txt.gz"), header=T, sep="\t", stringsAsFactors=F) ### Genotype files genotype.IDs <- read.table("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/ChildStudyConsentSet_phs000342.Framingham.v16.p10.c1.HMB-IRB-MDS/GenotypeFiles/phg000006.v9.FHS_SHARe_Affy500K.genotype-calls-matrixfmt.c1/subject_level_PLINK_sets/FHS_SHARe_Affy500K_subjects_c1.fam", header=F) names(genotype.IDs) <- c("famid", "SAMPID", "patid", "matid", "sex", "phenotype") length(which(genotype.IDs$SAMPID %in% ID.map$SAMPID)) ## 6954 x <- merge(genotype.IDs, ID.map, by="SAMPID", all.x=T) x <- x[,c(1:5,7)] ### There are many duplicates, which I can remove because I only want ### a single entry for every SAMPID-SUBJID x <- x[which(!duplicated(x$dbGaP_Subject_ID)),] almost.final <- merge(phenotypes, x, by="dbGaP_Subject_ID", all.x=T) #################################################### ### Bring in genetic PCs and ancestry categories ### #################################################### PCs <- read.table("/work/KellerLab/Zhen/FRAMINGHAM/PCA/FRAMINGHAM_pcs_and_ancestries.txt", header=T) names(PCs)[2] <- c("SAMPID") final <- merge(almost.final, PCs, by="SAMPID", all.x=T) phenotypes.ped <- subset(final, ancestry=="EUR", select=c("famid", "SAMPID", "patid", "matid", "sex", "cpd", "ai", "si", "sc", "dpw", "dnd", "bde", "bdl")) phenotypes.ped[is.na(phenotypes.ped)] <- "x" covariates.ped <- subset(final, ancestry=="EUR", select=c("famid", "SAMPID", "patid", "matid", "sex", "Age", "age2", "sc", "height", "weight", "cohort", "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10")) covariates.ped[is.na(covariates.ped)] <- "x" write.table(phenotypes.ped, file="Framingham.EUR.phenotypes.ped", quote=F, col.names=T, row.names=F) write.table(covariates.ped, file="Framingham.EUR.covariates.ped", quote=F, col.names=T, row.names=F) ===== Genotypes ===== We used the Affy 500K genotypes found here: /work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/ChildStudyConsentSet_phs000342.Framingham.v16.p10.c1.HMB-IRB-MDS/GenotypeFiles/phg000006.v9.FHS_SHARe_Affy500K.genotype-calls-matrixfmt.c1/subject_level_PLINK_sets/FHS_SHARe_Affy500K_subjects_c1.[bed|bim|fam] ====== ARIC ====== ===== Phenotypes ===== phenotypes <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/PhenotypeFiles/phs000090.v3.pht000114.v2.p1.c1.GENEVA_ARIC_Subject_Phenotypes.HMB-IRB.txt.gz",header=T,sep="\t",stringsAsFactors=F) phenotypes <- subset(phenotypes, select=c("geneva_id", "racegrp", "gender","v1age01", "anta01","anta04", "drnkr01", "hom29", 'hom35', "hom32", 'cigt01','evrsmk01', 'dtia90','dtia96', 'dtia97','dtia98', 'cursmk01','forsmk01')) ### rename phenotypes to be readable names(phenotypes)[c(1,2,3,4,5,6)] <- c("geneva_id", "race", "sex", "age", "height", "weight") ### To connect sample ids to geneva ids, take SAMPID (the ID used in the genotype fam file, and SUBJID (aka geneva_id) ) id_map <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-qc.MULTI/geno-qc/samp-subj-mapping.csv.gz"),header=T,sep=",",stringsAsFactors=F)[,c(2,1)] names(id_map) <- c("SAMPID", "geneva_id") phenotypes <- merge(phenotypes, id_map, by="geneva_id", all.x=TRUE) ### import genotype data to get family info fam_data <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-calls-matrixfmt.c1.GRU.update1/Genotypes_with_flagged_chromosomal_abnormalities_zeroed_out/ARIC_PLINK_flagged_chromosomal_abnormalities_zeroed_out.fam", col.names = c("fam_id", "SAMPID", "patid", "matid", "sex", "dummy")) ### Replace 0's with "x" for rvTest preferred formatting fam_data$patid <- fam_data$matid <- fam_data$fam_id[fam_data$fam_id == 0] <- "x" ######################################## ###---- Derive GSCAN phenotypes -----### ######################################## ### DRINKER VERSUS NON-DRINKER ### ARIC variable name is "drnkr01". ### Combination of "Do you presently drink alcoholic beverages?" and "Have you ever consumed alcoholic beverages?" ### Response option for both questions are "yes" or "no", which are turned into the options below. ### Response options: ### 1 = Current Drinker ### 2 = Former Drinker ### 3 = Never Drinker ### 4 = Unknown ### ### Descriptives: ### table(phenotypes$drnkr01) ### 1 2 3 4 ### 7257 2309 3153 6 ### ### To obtain GSCAN "DND" collapse across Former and Never Drinkers ### and make "Non-Drinkers". Current Drinkers will be made "Drinkers" dnd <- phenotypes$drnkr01 dnd[dnd == 1] <- "Current Drinker" dnd[dnd == 2 | dnd == 3] <- 1 dnd[dnd == "Current Drinker"] <- 2 dnd[dnd == 4 | is.na(dnd)] <- "x" ### AGE OF INITIATION OF SMOKING ### ### ARIC variable name is "hom29". ### "How old were you when you first started regular cigarette smoking?" ### Response option is an integer value. ### ### Descriptives: ### ### > table(phenotypes$hom29) ### 0 1 4 5 6 7 8 9 10 11 12 13 14 15 16 17 ### 19 1 2 10 11 15 22 32 65 44 154 187 302 659 941 715 ### 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 ### 1219 567 703 447 275 129 88 247 56 45 59 22 100 8 34 8 ### 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 ### 17 51 9 9 10 8 35 3 11 5 5 16 3 4 2 3 ### 50 51 52 55 57 59 60 62 ### 7 1 2 1 1 2 2 1 ### ### > summary(phenotypes$hom29) ### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's ### 0.00 16.00 18.00 18.77 20.00 62.00 5377 ### ai <- phenotypes$hom29 ### remove ages older than 35 and younger than 10 ai[ai > 35 | ai < 10 | is.na(ai)] <- "x" ### CIGARETTES PER DAY ### ARIC variable name is "hom35" ### "On the average of the entire time you smoked, how many cigarettes did you usually smoke per day?" ### Response option is integer, or "0" for <1 cigarette per day ### ### > table(phenotypes$hom35) ### 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ### 46 54 118 187 133 254 146 84 89 17 990 23 106 30 12 520 ### 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 ### 31 30 62 5 2559 1 5 10 5 193 9 3 7 6 771 3 ### 32 33 34 35 36 37 38 40 42 43 45 50 51 54 55 58 ### 1 1 1 44 3 1 1 572 1 3 14 72 1 1 3 1 ### 60 65 70 75 80 86 90 99 ### 100 1 4 2 10 1 1 3 ### > summary(phenotypes$hom35) ### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's ### 0.00 10.00 20.00 19.67 24.00 99.00 5420 ### Responses are binned in accordance with the GSCAN Analysis Plan. cpd <- phenotypes$hom35 cpd[cpd <= 5 & cpd >= 1] <- 1 cpd[cpd <= 15 & cpd >= 6] <- 2 cpd[cpd <= 25 & cpd >= 16] <- 3 cpd[cpd <= 35 & cpd >= 26] <- 4 cpd[cpd >= 36 & cpd <= 60] <- 5 cpd[cpd > 60 | is.na(cpd)] <- "x" ### DRINKS PER WEEK ### ARIC variable names are "dtia96", "dtia97", and "dtia98" ### "dtia96" - "How many glasses of wine do you usualy have per week? (4oz. glasses; round down)." ### "dtia97" - "How many bottles of cans or beer do you usualy have per week? (12oz. bottles or cans; round down)." ### "dtia98" - "How many drinks of hard liquor do you usualy have per week? (4oz. glasses; round down)." ### Response option for all three is integer. ### ### Descriptives: ### ### >table(phenotypes$dtia96) ### 0 1 2 3 4 5 6 7 8 9 10 11 12 14 15 16 ### 5226 844 461 255 147 90 75 50 27 3 34 1 15 28 9 2 ### 17 18 20 21 25 28 30 32 33 35 40 ### 1 3 7 5 1 2 3 1 1 1 1 ### ### >summary(phenotypes$dtia96) ### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's ### 0.000 0.000 0.000 0.868 1.000 40.000 5478 ### ### >table(phenotypes$dtia97) ### 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ### 4356 674 528 312 214 107 297 93 76 10 97 3 186 4 40 28 ### 16 18 19 20 21 22 23 24 25 28 30 32 33 35 36 40 ### 5 28 1 36 19 1 1 89 8 8 12 2 1 10 8 6 ### 42 45 48 49 50 56 60 63 70 72 80 92 ### 13 2 6 1 3 2 4 1 1 2 1 1 ### ### >summary(phenotypes$dtia97) ### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's ### 0.000 0.000 0.000 2.609 2.000 92.000 5474 ### ### >table(phenotypes$dtia98) ### 0 1 2 3 4 5 6 7 8 9 10 11 12 14 15 16 ### 5226 844 461 255 147 90 75 50 27 3 34 1 15 28 9 2 ### 17 18 20 21 25 28 30 32 33 35 40 ### 1 3 7 5 1 2 3 1 1 1 1 ### ### >summary(phenotypes$dtia96) ### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's ### 0.000 0.000 0.000 0.868 1.000 40.000 5478 ### ### >table(phenotypes$dtia97) ### 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ### 4356 674 528 312 214 107 297 93 76 10 97 3 186 4 40 28 ### 16 18 19 20 21 22 23 24 25 28 30 32 33 35 36 40 ### 5 28 1 36 19 1 1 89 8 8 12 2 1 10 8 6 ### 42 45 48 49 50 56 60 63 70 72 80 92 ### 13 2 6 1 3 2 4 1 1 2 1 1 ### ### >summary(phenotypes$dtia97) ### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's ### 0.000 0.000 0.000 2.609 2.000 92.000 5474 ### ### >table(phenotypes$dtia98) ### 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ### 4387 735 551 295 239 190 148 138 70 11 151 14 57 3 103 33 ### 16 17 18 20 21 24 25 26 27 28 30 32 33 34 35 36 ### 9 9 7 36 30 1 10 1 1 9 6 2 1 2 4 1 ### 39 40 44 45 47 48 50 51 52 54 55 56 63 64 75 77 ### 1 7 1 1 1 2 5 1 1 1 1 3 1 2 1 2 ### 90 99 ### 1 2 ### ### >summary(phenotypes$dtia98) ### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's ### 0.000 0.000 0.000 2.227 2.000 99.000 5483 wine <- phenotypes$dtia96 # 1 drink = 4oz beer <- phenotypes$dtia97 # 1 drink = 12oz spirits <- phenotypes$dtia98 # 1 drink = 1.5oz ### muliply wine by 4 and divide wine by 5 to normalize to standard drink of 5 oz for wine ### Combine all alcohol types, left-anchor at 1, and log dpw <- log((wine*4/5 + beer + spirits) + 1) dpw[is.na(dpw)] <- "x" ### SMOKING INITIATION ### ARIC variable name is "evrsmk01" ### The variable checks answers to "Have you ever smoked cigarettes?" and "Do you now smoke cigarettes?". ### Response options are "yes" or "no". ### ### Descriptives: ### ### >table(phenotypes$evrsmk01) ### 0 1 ### 5328 7434 ### ### >summary(phenotypes$evrsmk01) ### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's ### 0.0000 0.0000 1.0000 0.5825 1.0000 1.0000 9 si <- phenotypes$evrsmk01 si[si == 1] <- 2 si[si == 0] <- 1 si[is.na(si)] <- "x" ### SMOKING CESSATION ### ARIC variable names are "cursmk01" and "forsmk01" ### Both varaiables take into account the questions: "Have you ever smoked cigarettes?" and "Do you now smoke cigarettes?" ### Response options are "yes" or "no". ### Smoking inititation (si) is coded as "2" for "Smoker" if "yes" to "Have you ever smoked cigarettes?" ### If a subsequent "yes" to "Do you now smoke cigarettes?", smoking cessation (sc) is coded as "2" for "Current Smoker". ### If a subsequent "no" to "Do you now smoke cigarettes?", smoking cessation (sc) is coded as "1" for "Former Smoker". ### Smoking inititation (si) is coded as "1" for "Non Smoker" if "no" to "Have you ever smoked cigarettes?" current.smoker <- subset(phenotypes, select=c("cursmk01")) former.smoker <- subset(phenotypes, select=c("forsmk01")) N <- nrow(phenotypes) sc <- rep(NA, N) for(i in 1:N){ if(is.na(current.smoker[i,1]) | is.na(former.smoker[i,1])){ sc[i] <- NA } else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 0){ sc[i] <- NA } else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 1){ sc[i] <- 1 ### former smokers are coded as 1 } else if (current.smoker[i,1] == 1 & former.smoker[i,1] == 0){ sc[i] <- 2 ### current smokers are coded as 2 } } sc[is.na(sc)] <- "x" ### Create dataframe with our new GSCAN variables N <- nrow(phenotypes) NAs <- rep("x", N) gscan.phenotypes <- data.frame(SAMPID = phenotypes$SAMPID, famid = NAs,geneva_id = phenotypes$geneva_id,patid = NAs,matid = NAs,sex = ifelse(phenotypes$sex == "M", 1, 2),cpd = cpd,ai = ai,si = si,sc = sc,dnd = dnd,dpw = dpw,age = phenotypes$age,age2 = phenotypes$age^2,height = phenotypes$height,weight = phenotypes$weight,currentformersmoker = sc) ### Merge in the SAMPID, which is used in the genotype files gscan.phenotypes <- merge(gscan.phenotypes, id_map, by="geneva_id", all.x=TRUE) ### Reorder phenotype file to make pedigree file consistent with genotype IDs gscan.phenotypes <- gscan.phenotypes[c(17,2,16,3:15)] colnames(gscan.phenotypes) [2] <- "SAMPID" ### Read in PCs and add to pedigree file, then write out to a phenotype and covariate file ### [ here read in PCs and merge into phenotype file (probably by the SAMPID) ] pcs <- read.table("/rc_scratch/hayo0753/aric/aric_ancestry_and_pcs", head=TRUE, stringsAsFactors=F) colnames(pcs) [1] <- "SAMPID" pcs <- merge(pcs, gscan.phenotypes, by="SAMPID", all.x=TRUE) ############# PRELIMINARY ##################### ### Write to file [NOTE TO HANNAH: will have to be changed once we ### have PCs and ancestry groups identified. PCs will have to be read ### in like with read.table() and we'll have to subset the dataset ### into European and African ancestry, and then write out one ### phenotype and covariate file per ancestry group. ### EUROPEANS phenotypes.EUR.ped <- subset(pcs, ancestry == "EUR",select=c("famid","SAMPID","patid","matid", "sex","cpd", "ai","si", "sc", "dnd","dpw")) write.table(phenotypes.EUR.ped, file="ARIC.EUR.phenotypes1.ped", quote=F, col.names=T, row.names=F, sep="\t") covariates.EUR.ped <- subset(pcs, ancestry == "EUR", select=c("famid","SAMPID","patid","matid", "sex","age", "age2", "height", "weight", "currentformersmoker","PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8","PC9", "PC10")) write.table(covariates.EUR.ped, file="ARIC.EUR.covariates1.ped", quote=F, col.names=T, row.names=F, sep="\t") phenotypes.AFR.ped <- subset(pcs, ancestry == "AFR",select=c("famid","SAMPID","patid","matid", "sex","cpd", "ai","si", "sc", "dnd","dpw")) write.table(phenotypes.AFR.ped, file="ARIC.AFR.phenotypes1.ped", quote=F, col.names=T, row.names=F, sep="\t") covariates.AFR.ped <- subset(pcs, ancestry == "AFR", select=c("famid","SAMPID","patid","matid", "sex","age", "age2", "height", "weight", "currentformersmoker", "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8","PC9", "PC10")) write.table(covariates.AFR.ped, file="ARIC.AFR.covariates1.ped", quote=F, col.names=T, row.names=F, sep="\t") ### Must remove duplicates from all files, use UNIX ### sort ARIC.EUR.phenotypes1.ped | uniq > ARIC.EUR.phenotypes.ped ### sort ARIC.EUR.covariates.ped | uniq > ARIC.EUR.covariates.ped ### sort ARIC.AFR.phenotypes1.ped | uniq > ARIC.AFR.phenotypes.ped ### sort ARIC.AFR.covariates.ped | uniq > ARIC.AFR.covariates.ped ====== MESA ====== (Hannah/Joyce to update following Framingham as a guide) ===== Phenotypes ===== Description of phenotypes can be found here: {{:file_mesa_phenotypes_-_final.pdf}} ====== eMERGE ====== ===== Phenotypes ===== options(stringsAsFactors=F) ### eMERGE is broken into different consent classes. We can conduct analyses on hmb, hmb-gso-nic, and emerge.hmb <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c1.HMB/PhenotypeFiles/phs000360.v3.pht003255.v2.p1.c1.MergedSet_Subject_Phenotypes.HMB.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F) emerge.hmb.genos <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c1.HMB/GenotypeFiles/matrix/c1.HMB/eMerge_660_11212012_c1.fam", header=FALSE, sep="\t", stringsAsFactors=F) emerge.hmb.gso.nic <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c3.HM-B-GSO-NIC/PhenotypeFiles/phs000360.v3.pht003255.v2.p1.c3.MergedSet_Subject_Phenotypes.HM-B-GSO-NIC.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F) emerge.hmb.gso.nic.genos <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c3.HM-B-GSO-NIC/GenotypeFiles/matrix/c3.HM-B-GSO-NIC/eMerge_660_11212012_c3.fam", header=FALSE, sep="\t", stringsAsFactors=F) emerge.hmb.gso <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c4.HMB-GSO/PhenotypeFiles/phs000360.v3.pht003255.v2.p1.c4.MergedSet_Subject_Phenotypes.HMB-GSO.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F) emerge.hmb.gso.genos <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c4.HMB-GSO/GenotypeFiles/matrix/c4.HMB-GSO/eMerge_660_11212012_c4.fam", header=FALSE, sep="\t", stringsAsFactors=F) ### Merge all files above according to SUBJID, which is used in the ### genotype files. emerge <- merge(emerge.hmb, emerge.hmb.gso, all=T) emerge <- merge(d, emerge.hmb.gso.nic, all=T) ### SMOKING INITIATION ### ### The eMERGE variable name is SMOKING_STATUS ### C65108 = never smoker ### C67147 = current smoker ### C67148 = past smoker ### C67151 = Unknown if ever smoked ### ### Descriptives: ### ### table(emerge$SMOKING_STATUS) ### ### C65108 C67147 C67148 C67151 ### 2217 1736 3457 9635 si <- emerge$SMOKING_STATUS si[si == "C67147" | si == "C67148"] <- 2 si[si == "C65108"] <- 1 si[si != 1 & si != 2] <- NA ### SMOKING Cessation ### ### Current == 2 & Former == 1 in GSCAN. This is already the case for these data. sc <- emerge$SMOKING_STATUS sc[sc == "C67147"] <- 2 sc[sc == "C67148"] <- 1 sc[sc != 1 & sc != 2] <- NA ### eMERGE age variable is tricky because there is no obvious age at ### assessment. We will use their "DECADE_BIRTH" as a terrible ### approximation. ### 1=1900-1919; 2=1920-1929, 3=1930-1939; 4=1940-1949; 5=1950-1959; 6=Unknown ### ### Descriptives: ### ### table(emerge$DECADE_BIRTH) ### ### . 1 2 3 4 5 6 7 8 9 99 ### 6 612 2667 3533 4439 3127 1291 761 490 10 109 birthyear <- emerge$DECADE_BIRTH birthyear[birthyear == "99"] <- NA birthyear[birthyear == "."] <- NA ### SEX sex <- emerge$SEX sex[sex == "C46109"] <- 1 sex[sex == "C46110"] <- 2 ### Scott decided not to correct for additional case-control variables ### given what appears to be a highly complex sample and uncertainty ### about the best course of action to account for disease status in ### conducting smoking analyses. phenotypes <- data.frame(fid = emerge$SUBJID, iid = emerge$SUBJID, patid = "x", matid = "x", sex = sex, si = si, sc = sc) phenotypes[is.na(phenotypes)] <- "x" write.table(phenotypes, "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/eMERGE/GSCAN_eMERGE_phenotypes.ped", row.names=F, quote = F, sep="\t") covariates <- data.frame(fid = emerge$SUBJID, iid = emerge$SUBJID, patid = "x", matid = "x", sex = sex, birthyear = birthyear) covariates[is.na(covariates)] <- "x" write.table(covariates, "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/eMERGE/GSCAN_eMERGE_covariates.ped", row.names=F, quote = F, sep="\t") ====== Stroke ====== ===== Phenotypes ===== ### Date: Feb 13 2017 ### Author: Scott Vrieze options(stringsAsFactors=F) ### Load in dataset ### ninds <- read.table("/work/KellerLab/GSCAN/dbGaP/CIDR_StrokeGenetics/PhenoGenotypeFiles/RootStudyConsentSet_phs000615.CIDR_International_StrokeGenetics.v1.p1.c3.GRU-NPU/PhenotypeFiles/phs000615.v1.pht003307.v1.p1.c3.IschemicStroke_Subject_Phenotypes.GRU-NPU.txt.gz", header = T, sep="\t") ### The file reads into R incorrectly because of a weird trailing tab ### character in the data file, so use the below code to shift column ### names to the correct column. names(ninds)[1] <- "XXX" ninds$dbGaP_Subject_ID <- row.names(ninds) ninds$smokingStatus <- NULL names(ninds) <- c(names(ninds)[2:17], "smokingStatus", "dbGaP_Subject_ID") ### subset the only variables needed pheno <- subset(ninds, select=c("subject_id", "smokingStatus", "age", "gender", "AFFECTION_STATUS")) ###————————————————————### ### SMOKING INITIATION ### ###————————————————————### ### NINDS variable is “smokingStatus” ### Variables are “NEVER”, “FORMER”, “CURRENT”, “NO”, “UNKNOWN”, and blank. ### ### ’CURRENT' defined as cigarette smoking within last 30 days, ### 'FORMER' defined as ### more than 100 cigarettes in one's lifetime ### but no smoking within the last 30 ### days; ’NEVER' defined as ### less than 100 cigarettes smoked in one's lifetime. ### table(pheno$smokingStatus) ### ### CURRENT FORMER NEVER NO UNKNOWN ### 63 727 1137 2397 4 251 si <- pheno$smokingStatus si[si == "CURRENT" | si == "FORMER"] <- 2 si[si == "NEVER" | si == "NO"] <- 1 si[si != "1" & si != "2"] <- NA si <- as.numeric(si) ### table(si) ### ### 1 2 ### 2401 1864 ###————————————————————-### ### SMOKING CESSATION ### ###————————————————————-### ### NINDS variable is “smokingStatus” ### Variables are “NEVER”, “FORMER”, “CURRENT”, “NO”, “UNKNOWN”, and blank. ### ### ’CURRENT' defined as cigarette smoking within last 30 days, ### 'FORMER' defined as more than 100 cigarettes in one's lifetime but ### no smoking within the last 30 days; ’NEVER' defined as less than ### 100 cigarettes smoked in one's lifetime. ### table(pheno$smokingStatus) ### ### CURRENT FORMER NEVER NO UNKNOWN ### 63 727 1137 2397 4 251 sc <- pheno$smokingStatus sc[sc == "CURRENT"] <- 2 sc[sc == "FORMER"] <- 1 sc[sc != "1" & sc != "2"] <- NA sc <- as.numeric(sc) ### table(pheno$V5) ### ### 1 2 ### 1137 727 ###——————————### ### GENDER ### ###——————————### ### NINDS variable is “gender” ### Variables are “F” and “M” ### table(pheno$gender) ### ### F M ### 2627 1952 sex <- pheno$gender sex[sex == "M"] <- 1 sex[sex == "F"] <- 2 sex <- as.numeric(sex) ###----------------### ### Write to files ### ###----------------### ### This study uses the "subject_id" ID field in the genotype files, ### so use that here, instead of the dbGaP_Subject_ID phenotypes <- data.frame(fid = pheno$subject_id, iid = pheno$subject_id, patid = "x", matid = "x", sex = sex, si = si, sc = sc) write.table(phenotypes, "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/Stroke/GSCAN_STROKE_phenotypes.ped", row.names=F, quote = F, sep="\t") covariates <- data.frame(fid = pheno$subject_id, iid = pheno$subject_id, patid = "x", matid = "x", sex = sex, age = pheno$age, age2 = pheno$age^2, AFFECTION_STATUS = pheno$AFFECTION_STATUS) write.table(covariates, "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/Stroke/GSCAN_STROKE_covariates.ped", row.names=F, quote = F, sep="\t") ### save table ====== BEAGESS ====== ===== Phenotypes ===== options(stringsAsFactors=F) beagess <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/PhenotypeFiles/phs000869.v1.pht004610.v1.p1.c1.BEAGESS_Subject_Phenotypes.GRU-MDS.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F) genos <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/GenotypeFiles/phg000580.v1.NCI_BEAGESS.genotype-calls-matrixfmt.c1.GRU-MDS.update/BEAGESS_dbGaP_29Jan2015.fam", header=FALSE, stringsAsFactors=F) ### The file reads into R incorrectly because of a weird trailing tab ### character in the data file, so use the below code to shift column ### names to the correct column. names(beagess)[1] <- "XXX" beagess$dbGaP_Subject_ID <- row.names(beagess) beagess$assocEABEvsCO <- NULL names(beagess) <- c(names(beagess)[2:11], "assocEABEvsCO", "dbGaP_Subject_ID") ### SMOKING INITIATION ### ### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI". ### "Cigarette smoking status." ### Response options are "-99", "-9", "0", "1" and "2". ### -99 = not consented ### -9 = Missing ### 0 = Never ### 1 = Former ### 2 = Current ### ### Descriptives: ### ### > table(beagess_data$BMI) ### ### -99 -9 0 1 2 ### 494 2051 1515 2288 575 ### ### > summary(beagess_data$BMI) ### Min. 1st Qu. Median Mean 3rd Qu. Max. ### -99.000 -9.000 0.000 -9.234 1.000 2.000 si <- beagess$cig_smk_status si[si == 1 | si == 2] <- 2 si[si == 0] <- 1 si[si != 1 & si != 2] <- NA ### SMOKING Cessation ### ### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI". ### "Cigarette smoking status." ### Response options are "-99", "-9", "0", "1" and "2". ### -99 = not consented ### -9 = Missing ### 0 = Never ### 1 = Former ### 2 = Current ### ### Descriptives: ### table(beagess$cig_smk_status) ### -99 -9 0 1 2 ### 494 2051 1515 2288 575 ### ### Current == 2 & Former == 1 in GSCAN. This is already the case for these data. sc <- beagess$cig_smk_status sc[sc == 0 | sc == -9 | sc == -99] <- NA ### BEAGESS variable name is "agegpcat" ### "Age in 5 year categories" ### Response options are integers 1 through 14 or -9. ### -9 = Missing ### 1 = 15-29 years of age ### 2 = 30-34 years of age ### 3 = 35-39 years of age ### 4 = 40-44 years of age ### 5 = 45-49 years of age ### 6 = 50-54 years of age ### 7 = 55-59 years of age ### 8 = 60-64 years of age ### 9 = 65-69 years of age ### 10 = 70-74 years of age ### 11 = 75-79 years of age ### 12 = 80-84 years of age ### 13 = 85-89 years of age ### 14 = 90-100 years of age ### ### Descriptives: ### ### > table(beagess_data$agegpcat) ### ### -9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ### 27 19 48 112 190 378 657 958 1134 1238 1018 794 246 87 17 ### ### This is fine, but change the -9 to NA beagess$agegpcat[beagess$agegpcat == -9] <- NA ### looks like BEAGESS uses their own internal "SUBJECT_ID" in the ### genotype files, so we'll use that in our pedigree files phenotypes <- data.frame(fid = beagess$SUBJECT_ID, iid = beagess$SUBJECT_ID, patid = "x", matid = "x", sex = beagess$sex, si = si, sc = sc) phenotypes[is.na(phenotypes)] <- "x" write.table(phenotypes, "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/BEAGESS/GSCAN_BEAGESS_phenotypes.ped", row.names=F, quote = F, sep="\t") covariates <- data.frame(fid = beagess$SUBJECT_ID, iid = beagess$SUBJECT_ID, patid = "x", matid = "x", sex = beagess$sex, age = beagess$age, age2 = beagess$age^2, Barretts.esophagus.case_v_control = beagess$assocBEvsCO, Esophageal.adenocarcinoma.case_v_control = beagess$assocEAvsCO) covariates[is.na(covariates)] <- "x" write.table(covariates, "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/BEAGESS/GSCAN_BEAGESS_covariates.ped", row.names=F, quote = F, sep="\t") ### save table ====== Jackson Heart Study ====== ===== Phenotypes ===== hmb <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/PhenotypeFiles/phs000499.v3.pht003209.v1.p1.c3.JHS_CARe_Subject_Phenotypes.HMB-IRB.txt.gz", header=T, sep="\t", stringsAsFactors=F) npu <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c1.HMB-IRB-NPU/PhenotypeFiles/phs000499.v3.pht003209.v1.p1.c1.JHS_CARe_Subject_Phenotypes.HMB-IRB-NPU.txt.gz", header=T, sep="\t", stringsAsFactors=F) phens <- rbind(hmb, npu) hmb_geno <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/GenotypeFiles/phg000103.v1.JHS.genotype-calls-matrixfmt.c3.HMB-IRB/JHS_PLINK_illu_GRU.fam", header=F, stringsAsFactors=F) npu_geno <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c1.HMB-IRB-NPU/GenotypeFiles/phg000103.v1.JHS.genotype-calls-matrixfmt.c1.HMB-IRB-NPU/JHS_PLINK_illu_NCU.fam", header=F, stringsAsFactor=F) geno <- rbind(hmb_geno, npu_geno) jhs_map <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/GenotypeFiles/phg000104.v1.JHS.sample-info.MULTI/JHS_subject_sample_map_8_10_2010.txt", header=T, sep="\t", stringsAsFactors=F) jhs_pcs <- read.table("/rc_scratch/hayo0753/JHS_files/JHS_ancestry_and_pcs.txt", header=T, stringsAsFactors=F) names(jhs_pcs) [2] <- "SUBJID" names(geno) <- c("SUBJID", "iid", "patid", "matid", "sex", "phen") filtered_geno <- geno[(geno$SUBJID %in% phens$SUBJID),] filtered_phens <- phens[(phens$SUBJID %in% geno$SUBJID),] mapped_geno <- merge(filtered_phens, filtered_geno, by="SUBJID", all.x=T) mapped_geno_pcs <- merge(mapped_geno, jhs_pcs, by="SUBJID", all.x=T) covariates <- mapped_geno_pcs[c(1,69,70:72,5,75:84)] names(covariates) [1] <- "fid" names(covariates) [2] <- "iid" names(covariates) [6] <- "age" ###--------------------### ### Smoking initiation ### ###--------------------### ### ### JHSCare variables are "current_smoker_baseline" and "former_smoking_baseline" ### current_smoker_baseline = Current smoking status at first participant visit ### former_smoker_baseline = Former smoking status at first participant visit ### ### Response options are ### 0 - No ### 1 - Yes ### ### table(si) ### ### 0 1 x ### 1206 537 9 current.smoker <- subset(mapped_geno_pcs, select=c("current_smoker_baseline", "SUBJID")) former.smoker <- subset(mapped_geno_pcs, select=c("former_smoker_baseline", "SUBJID")) N <- nrow(mapped_geno_pcs) si <- rep(NA, N) for(i in 1:N){ if(is.na(current.smoker[i,1]) | is.na(former.smoker[i,1])){ si[i] <- "x" } else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 0){ si[i] <- "1" } else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 1){ si[i] <- 2 ### former smokers are coded as 2 } else if (current.smoker[i,1] == 1 & former.smoker[i,1] == 0){ si[i] <- 2 ### current smokers are coded as 2 } } mapped_geno_pcs_phen <- cbind(si, mapped_geno_pcs) ###--------------------### ### Smoking Cessation ### ###--------------------### ### ### JHSCare variables are "current_smoker_baseline" and "former_smoking_baseline" ### current_smoker_baseline = Current smoking status at first participant visit ### former_smoker_baseline = Former smoking status at first participant visit ### ### Response options are ### 0 - No ### 1 - Yes ### ### table(sc) ### ### 0 1 x ### 1206 537 9 current.smoker <- subset(mapped_geno_pcs, select=c("current_smoker_baseline", "SUBJID")) former.smoker <- subset(mapped_geno_pcs, select=c("former_smoker_baseline","SUBJID")) N <- nrow(mapped_geno_pcs) sc <- rep(NA, N) for(i in 1:N){ if(is.na(current.smoker[i,1]) | is.na(former.smoker[i,1])){ sc[i] <- "x" } else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 0){ sc[i] <- "x" } else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 1){ sc[i] <- 1 ### former smokers are coded as 1 } else if (current.smoker[i,1] == 1 & former.smoker[i,1] == 0){ sc[i] <- 2 ### current smokers are coded as 2 } } phenotypes <- cbind(sc, mapped_geno_pcs_phen) phenotypes <- phenotypes[c(3,71,72:74,1:2)] names(phenotypes)[1] <- "fid" write.table(covariates, "AFR.JHS.covariates.ped", row.names=F, quote=F) write.table(phenotypes, "AFR.JHS.phenotypes.ped", row.names=F, quote=F) ======= Genotype Processing ======= ===== Pre-Phasing QC ===== QC parameters that we chose: MAF > 0.01 SNP callrate > 0.95 Missingness per individual > 0.95 HWE = 0.05 / number of markers but greater than 5e-8 ## To update the strand builds: http://www.well.ox.ac.uk/~wrayner/strand/ ## Check strands against latest 1000G: http://www.well.ox.ac.uk/~wrayner/tools/ perl HRC-1000G-check-bim.pl \ -b ARIC_b37_filtered.bim \ -f ARIC_b37_filtered.frq \ -r 1000GP_Phase3_combined.legend \ -g \ -p EUR ## Phasing using shapeit (ARIC used as an example) shapeit -B ARIC_b37_filtered-updated-chr${1} -M /rc_scratch/meli7712/dbGAP/1000GP_Phase3/genetic_map_chr${1}_combined_b37.txt -O phased/ARIC_b37_filtered-updated-chr${1}.phased -T 48 ## To convert the shapeit output into vcf shapeit -convert --input-haps mesa-chr${1}.phased \ --output-vcf mesa-chr${1}.phased.vcf \ -T 12 ## Imputation Minimac3-omp3 --haps mesa-chr${1}.phased.vcf \ --cpus 48 --refHaps /rc_scratch/meli7712/dbGAP/references/${1}.1000g.Phase3.v5.With.Parameter.Estimates.m3vcf.gz \ --chr ${1} \ --noPhoneHome \ --format GT,DS,GP \ --allTypedSites \ --prefix mesa-chr${1}.phased.imputed