======= UK Biobank =======

More information about the files used for [[uk_biobank|UKBiobank are here]]. In brief, we used the UK10K + 1kgp3 imputed vcfs provided by UKBionank and added in dosages w/ this python script:

import gzip, argparse, re, os, datetime
from subprocess import Popen, PIPE

def add_dosage(pair):
        a, b = pair
        probs = b.split(b',')
        dose = float(probs[1]) + (float(probs[2]) * 2)
        return a + b':' + str(dose).encode('ascii') + b':' + b

def gziplines(fname):
  f = Popen(['zcat', fname], stdout=PIPE)
  for line in f.stdout:
      yield line

parser = argparse.ArgumentParser()
parser.add_argument('inputVCF', help = 'The path to the VCF')
args = parser.parse_args()

flag = False

for line in gziplines(args.inputVCF):
        if line.startswith(b'#'):
                os.write(1, line.rstrip() + b'\n')
                if not flag:
                        os.write(1, b'##FORMAT=<ID=DS,Number=1,Type=Float,Description="Genotype Dosages">\n')
                        os.write(1, b'##Dosages added using the script add.dosages.subprocess.py at ' +
                                str(datetime.datetime.now()).encode('ascii') + b'\n')
                        flag = True
        else:
                elements = re.split(b'\t|:', line.rstrip())
                first8 = elements[:8]
                genotypes = elements[10:]
                form = b'GT:DS:GP'

                genotypes_split = zip(genotypes[::2], genotypes[1::2])
                try:
                        dose_genos = [add_dosage(pair) for pair in genotypes_split]
                except (ValueError, IndexError) as e:
                        os.write(2, "\n" + line)
                        os.write(2, line + "\n" + args.inputVCF + "\n\n")
                        raise e
                os.write(1, b'\t'.join(first8) + b'\t' + form + b'\t' + b'\t'.join(dose_genos) + b'\n')


======= dbGaP Studies =======


====== Framingham ======


===== Phenotypes =====


options(stringsAsFactors=F)

### Offspring cohort
off <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht000030.v7.p10.c1.ex1_1s.HMB-IRB-MDS.txt.gz"), header=T, sep="\t")

### Generation 3 cohort
g3 <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht000074.v9.p10.c1.ex3_1s.HMB-IRB-MDS.txt.gz"), header=T, skip=10, sep="\t", fill=T)

###---------------------###
### Cigarettes per Day  ###
###---------------------###

### Offspring Cohort Exam 1
### Framingham variable name is "A102".
###   "Usual number of cigarettes smoked (now or formerly)"
### Response option is an integer value
###
### table(off$A102)
###     0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15
###  1208   50   29   44   26   47   31   16   30    3  175    1   20    1    9   94
###    16   17   18   19   20   22   24   25   26   27   28   30   34   35   40   42
###     5    6   50    1  556   10    3   39    1    1    2  214    1   15  147    2
###    45   50   60   80   88   90
###     4   21   19    1   14    2
###
### summary(off$A102)
###  Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
###  0.00    0.00    7.00   12.11   20.00   90.00       8

cpd.off <- off$A102
cpd.off[cpd.off==0] <- NA ### Set those who smoke "0" cpd to NA
cpd.off[cpd.off > 60] <- 60 ### Set those who report smoking >40 cpd to 40

cpd.off[cpd.off <=  5 & cpd.off >=  1] <- 1
cpd.off[cpd.off <= 15 & cpd.off >=  6] <- 2
cpd.off[cpd.off <= 25 & cpd.off >= 16] <- 3
cpd.off[cpd.off <= 35 & cpd.off >= 26] <- 4
cpd.off[cpd.off >= 36 & cpd.off <= 60] <- 5
cpd.off[cpd.off > 60 | is.na(cpd.off)] <- NA

### G3 cohort Exam 1
### Framingham variable name is "G3A074
###   "IF EVER SMOKED CIGS REGULARLY: ON THE AVERAGE
###    OF THE ENTIRE TIME YOU SMOKED, HOW MANY CIGARETTES
###    DID YOU SMOKE PER DAY?"
### Response option is an integer value
###
### table(g3$G3A074)
###
###    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
### 3180   57   14   15   20   30    8   13    7    2  122    2   16    3    2   60 
###   16   17   18   19   20   21   22   23   24   25   28   30   35   40   45   50 
###    3    3   10    1  287    2    1    1    1   12    1   53    1   47    1   12 
###   55   60   80 
###    1    6    1
###
### summary(g3$G3A074)
###    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
###   0.000   0.000   0.000   3.528   0.000  80.000

cpd.g3 <- g3$G3A074
cpd.g3[cpd.g3==0] <- NA ### Set those who smoke "0" cpd to NA
cpd.g3[cpd.g3 > 60] <- 60 ### Set those who report smoking >40 cpd to 40

cpd.g3[cpd.g3 <=  5 & cpd.g3 >=  1] <- 1
cpd.g3[cpd.g3 <= 15 & cpd.g3 >=  6] <- 2
cpd.g3[cpd.g3 <= 25 & cpd.g3 >= 16] <- 3
cpd.g3[cpd.g3 <= 35 & cpd.g3 >= 26] <- 4
cpd.g3[cpd.g3 >= 36 & cpd.g3 <= 60] <- 5
cpd.g3[cpd.g3 > 60 | is.na(cpd.g3)] <- NA

###--------------------###
### Smoking initiation ###
###--------------------###
### Offspring Cohort Exam 1
### Framingham variable is "A99"
###     "Smokes cigarettes: Yes(now)/No/Former"
### Response option is fill-in 
###     0 is no, 1 is yes (now), 2 is former
###
### table(off$A99)
###
###    0    1    2
### 1203 1126  566

si.off <- off$A99
si.off[si.off==1] <- 2 ### Set current smoker "1" to smoker "2"
si.off[si.off==0] <- 1 ### Set never smoker "0" to nonsmoker "1"
                       ### Former smoker "2" remains as a "2" as smoker 


### G3 Cohort Exam 1
### Framingham variables is "G3A070"
###     G3A070 - HAVE YOU EVER SMOKED CIGARETTES REGULARLY? (NO MEANS
###              LESS THAN 20 PACKS OF CIGARETTES OR 12 OZ OF TOBACCO IN A 
###              LIFETIME OR LESS THAN 1 CIGARETTE A DAY FOR A YEAR.)"
### Response options:
###              "0" is no, "1" is yes,  "0" is unknown
###
### table(g3$G3A070)
###    0    1   20 
### 2137 1629    1
si.g3 <- g3$G3A070
si.g3[si.g3==20] <- NA
si.g3[si.g3==1] <- 2
si.g3[si.g3==0] <- 1


###---------------------###
###  Smoking cessation  ###
###---------------------###

### Offspring Cohort Exam 1
### Framingham variable is "A99"
###     Smokes cigarettes: Yes(now)/No/Former"
### Response option is fill-in
###     0 is no, 1 is yes (now), 2 is former
### 
### table(off$A99)
###
###    0    1    2
### 1203 1126  566

sc.off <- off$A99 
sc.off[sc.off==2] <- 3 ### temporarily set former smoker "2" as "3"
sc.off[sc.off==1] <- 2 ### set current smoker "1" as current smoker "2"
sc.off[sc.off==3] <- 1 ### set former smoker "3" as former smoker "1"
sc.off[sc.off==0] <- NA ### code all non smokers "0" as NA


### G3 Cohort Exam 1
### Framingham variables are “G3A070” and “G3A072”
### Response option is fill-in
### G3A070 - HAVE YOU EVER SMOKED CIGARETTES REGULARLY? (NO MEANS LESS THAN 20 PACKS OF CIGARETTES OR 12 OZ OF ###TOBACCO IN A LIFETIME OR LESS THAN 1 CIGARETTE A DAY FOR A YEAR.)
###“0” is no, “1” is yes,  “20” is unknown
### G3A072 - IF EVER SMOKED CIGARETTES REGULARLY: DO YOU NOW SMOKE CIGARETTES (AS OF 1 MONTH AGO)?
###“0” is no or never smoked, “1” is yes
###
### table(g3$G3A070)
###    0    1   20
### 2137 1629    1
###
### table(g3$G3A072)
### 0    1
### 3179  585
sc.g3 <- g3$G3A072
sc.g3[sc.g3==1] <- 2 ## These are our current smokers
sc.g3[sc.g3==0] <- 1
sc.g3 <- ifelse(sc.g3==1 & (g3$G3A070 == 0 | is.na(g3$G3A070)), NA, sc.g3)


###---------------------###
###  Age of initiation  ###
###---------------------###

### Offspring Cohort Exam 1
### Framingham variable "A100"
###     Age started smoking cigarettes regularly\
### Response option is an integer value, 88 is doesn't smoke regularly
### table(off$A100)
### 
###   0   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25
###   6   2   2   2   8   4  24  37  61 147 266 233 330 166 164  85  42  26  13  25 
###   26  27  28  29  30  31  33  34  35  37  38  39  40  44  45  46  51  88 
###   2   8   4   3   7   1   1   1   3   1   2   1   3   1   1   2   1 105 
###

ai.off <- off$A100
ai.off[ai.off < 10] <- NA ### Set AI less than 10 as missing\
ai.off[ai.off > 35] <- NA ### Set AI greater than 35 as missing \


### G3 Cohort Exam 1
### Framingham variable "G3A075"
###     IF EVER SMOKED CIGARETTES REGULARLY: HOW OLD WERE YOU WHEN YOU FIRST STARTED REGULAR CIGARETTE SMOKING?
### Response option is an integer value, 0 is never smoked
###  table(g3$G3A075)
###
###    0    5    7    8    9   10   11   12   13   14   15   16   17   18   19   20
### 2138    1    2    2    3    5   14   52  108  159  152  277  193  256   98  107
###   21   22   23   24   25   26   27   28   29   30   31   32   33   34   35   36
###   51   31   19   13   31    6    4    6    2   13    2    1    1    4    1    4
###   37   38   40   42   46   50
###    2    2    1    1    1    2
###

ai.g3 <- g3$G3A075
ai.g3[ai.g3 < 10] <- NA ### Set AI less than 10 as missing
ai.g3[ai.g3 > 35] <- NA ### Set AI greater than 35 as missing


###-------------------###
###  Drinks per week  ###
###-------------------###

### Offspring Cohort Exam 1 
### Framingham variables "A111", "A112", "A113"
###     A111 - Beer (bottles, cans or glasses per week)
###     A112 - Wine (glasses per week)
###     A113 - Cocktails, highballs, straight drinks (# per week)
### Response options are integer values. 
### table(off$A111)
###
###    0    1    2    3    4    5    6    7    8    9   10   12   14   15   16   18 
### 1452  524  171  131   78   49  125   34   33    4   46   68   21   21    5   13 
###   20   21   24   25   26   28   30   35   36   40   42   48   49   50   70   90 
###   18    7   48    5    1    6    6    2    2    4    4    6    1    2    1    1 
###
### table(off$A112)
###
###    0    1    2    3    4    5    6    7    8    9   10   12   14   15   16   18 
### 1329  919  218  105   69   41   22   64   25    3   25   11   30    2    2    2 
###   20   21   24   28   30   32   50 
###    7    4    2    3    1    2    1 
###
### table(off$A113)
### 
###    0    1    2    3    4    5    6    7    8    9   10   12   14   15   16   18 
###  861 1094  267  157   96   64   59   65   31    9   49   16   57   13    2    1
###   19   20   21   24   25   28   30   35   38   42   50   55   60   61 
###    1   12   10    4    2    2    6    2    1    1    1    1    1    1 
###
dpw.off <- rowSums(subset(off, select = c("A111", "A112", "A113")), na.rm=T)
### The above assumes all drinks are of equal ETOH content
dpw.off[dpw.off == 0] <- NA
dpw.off <- log(dpw.off)


### G3 Cohort Exam 1\
### Framingham variables are "G3A115", "G3A116", "G3A119", "G3A120",
###                          "G3A123", "G3A124", "G3A127", "G3A128",
###                          "G3A131", "G3A132"
###    G3A115 - BEER: NUMBER OF BEER (12 OZ. BOTTLE, GLASS, CAN) YOU DRINK PER WEEK OVER THE PAST YEAR
###    G3A116 - BEER: NUMBER OF BEER (12 OZ. BOTTLE, GLASS, CAN) YOU DRINK PER MONTH OVER THE PAST YEAR
###    G3A119 - WHITE WINE: NUMBER OF WHITE WINE (4 OZ GLASS) YOU DRINK PER WEEK OVER THE PAST YEAR
###    G3A120 - WHITE WINE: NUMBER OF WHITE WINE (4 OZ GLASS) YOU DRINK PER MONTH OVER THE PAST YEAR
###    G3A123 - RED WINE: NUMBER OF RED WINE (4 OZ GLASS) YOU DRINK PER WEEK OVER THE PAST YEAR
###    G3A124 - RED WINE: NUMBER OF RED WINE (4 OZ GLASS) YOU DRINK PER MONTH OVER THE PAST YEAR
###    G3A127 - LIQUOR/SPIRITS: AVERAGE NUMBER OF LIQUOR/SPIRITS (1 1/4 OZ JIGGER) YOU DRINK PER WEEK OVER THE PAST YEAR
###    G3A128 - LIQUOR/SPIRITS: AVERAGE NUMBER OF LIQUOR/SPIRITS (1 1/4 OZ JIGGER) YOU DRINK PER MONTH OVERTHE PAST YEAR
###    G3A131 - OTHER BEVERAGE: AVERAGE NUMBER OF OTHER BEVERAGE YOU DRINK PER WEEK OVER THE PAST YEAR
###    G3A132 - OTHER BEVERAGE: AVERAGE NUMBER OF OTHER BEVERAGE YOU DRINK PER MONTH OVER THE PAST YEAR
###
###  *Participant was allowed to report alcohol consumption in either drinks per week or drinks per
###   month. Therefore, to calculate total alcohol consumption, you must use both number of drinks per
###   week and number of drinks per month (E.G. DRINKS PER MONTH = SUM(OF (DRINKS PER WEEK)(DRINKS PER
###   MONTH/4)).
###
### Response options are integer values.
###
all <- subset(g3, select = c("G3A115", "G3A116", "G3A119", "G3A120", "G3A123", "G3A124", "G3A127", "G3A128",
                      "G3A131", "G3A132"))

names(all) <- c("beerpw", "beerpm", "wwinepw", "wwinepm", "rwinepw", "rwinepm", "liqpw", "liqpm", "othpw", "othpm")
w <- all[,c(1,3,5,7,9)]
m <- all[,c(2,4,6,8,10)]/4

dpw.tmp <- data.frame(beer = rowSums(cbind(w$beerpw, m$beerpm), na.rm=T),
                      wwine = rowSums(cbind(w$wwinepw, m$wwinepm), na.rm=T),
                      rwine = rowSums(cbind(w$rwinepw, m$rwinepm), na.rm=T),
                      liq = rowSums(cbind(w$liqpw, m$liqpm), na.rm=T),
                      oth = rowSums(cbind(w$othpw, m$othpm), na.rm=T))
                      

### WHITE WINE - convert from 4oz to 5oz drink
dpw.tmp$wwine <- dpw.tmp$wwine*5/4
### RED WINE - convert from 4oz to 5oz drink
dpw.tmp$rwine <- dpw.tmp$rwine*5/4
### LIQUOR - convert from 1.25 to 1.50 drink
dpw.tmp$liq <- dpw.tmp$liq*1.5/1.25
### OTHER DRINK is an unknown. leave as-is.
dpw.g3 <- rowSums(dpw.tmp, na.rm=T)

### We have a few outliers, draw cuttoff at 70 drinks / week
dpw.g3[dpw.g3 > 70] <- NA
dpw.g3[dpw.g3 == 0] <- NA
dpw.g3 <- log(dpw.g3 + .75)

###---------------------------###
###  Drinker vs. non-drinker  ###
###---------------------------###
###
### Use the variables already computed for dpw above. Assume
### anyone who is drinking less than 1 drink / month is a non-drinker
x <- subset(off, select = c("A111", "A112", "A113"))
index <- rep(NA, nrow(x))
for(i in 1:nrow(x)) {
    if(is.na(x[i,1]) & is.na(x[i,2]) & is.na(x[i,3])) {
        index[i] <- 1
    } else {
        index[i] <- 0
    }
}

dnd.off <-  ifelse(rowSums(subset(off, select = c("A111", "A112", "A113")), na.rm=T) == 0, 1, 2)
dnd.off[index==1] <- NA


x <- all
index <- rep(NA, nrow(x))
for(i in 1:nrow(x)) {
    if(is.na(x[i,1]) & is.na(x[i,2]) & is.na(x[i,3])) {
        index[i] <- 1
    } else {
        index[i] <- 0
    }
}


dnd.g3 <- ifelse(rowSums(all, na.rm=T) == 0, 1, 2)
dnd.g3[index==1] <- NA


###----------------------------###
### Binge drinking in everyone ###
### (Only available for G3)    ###
###----------------------------###
### G3 Cohort Exam 1 
### Framingham variable is "G3A137"
###     IF EVER CONSUMED ALCOHOL: WHAT WAS THE MAXIMUM NUMBER OF
###        DRINKS YOU HAD IN A 24 HOUR PERIOD DURING THE PAST MONTH?
### Response option is an integer value 
### 
### summary(g3$G3A137)
###    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
###   0.000   1.000   3.000   3.566   5.000  50.000       3

bde <- g3$G3A137
sex <- g3$G3A440 ### bde is defined differently for males & females
### sex == 1 is male, sex == 2 is female

tmp <- rep(NA, length(bde)) ### temporary vector to hold results and avoid conflicts

tmp[sex == 1 & bde < 5] <- 1
tmp[sex == 1 & bde >=5] <- 2
tmp[sex == 2 & bde < 4] <- 1
tmp[sex == 2 & bde >=4] <- 2

bde.g3 <- tmp

###-----------------------------------------###
###  Binge-drinking (in lifetime drinkers)  ###
###  (Only available for G3)                ###
###-----------------------------------------###

### G3 Cohort Exam 1
### Framingham variable is "G3A112"
###     HAVE YOU EVER CONSUMED ALCOHOLIC BEVERAGES (BEER, WINE, LIQUOR/SPIRITS)? \
### 0 is no, 1 is yes

lifednd <- g3$G3A112 ## No missings, so next command has no problem
bdl.g3 <- bde.g3
bdl.g3 <- ifelse(lifednd == 0, NA, bde.g3)


##################
##################
###            ###
### Covariates ###
###            ###
##################
##################


### HEIGHT
### Offspring Cohort variable is “A51”
### Generation 3 Cohort variable is “G3A446”

height.off <- off$A51
height.g3 <- g3$G3A446

### WEIGHT
### Offspring Cohort variable is “A50”
### Generation 3 Cohort variable is “G3A444”

weight.off <- off$A50
weight.g3 <- g3$G3A444


###-------------###
### Age at exam ###
###-------------###
### Get BIRTHDATE
birth <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht000740.v7.p10.c1.birthyr_alls.HMB-IRB-MDS.txt.gz"), header=T, sep="\t")

off.IDs <- off$dbGaP_Subject_ID
g3.IDs <- g3$dbGaP_Subject_ID

birth.off <- subset(birth, dbGaP_Subject_ID %in% off.IDs)
birth.g3 <- subset(birth, dbGaP_Subject_ID %in% g3.IDs)

### True exam dates spanned 1971 - 1975 for offspring
### True exam dates spanned 2002 - 2005 for generation 3
### A good-enough approximation is to take the middle
### year and subtract birth year to get age at exam

### Offspring Cohort
off.age <- data.frame(dbGaP_Subject_ID = birth.off$dbGaP_Subject_ID,
                      Age = 1973 - birth.off$birthyr)
### Framingham Cohort
g3.age <- data.frame(dbGaP_Subject_ID = birth.g3$dbGaP_Subject_ID,
                     Age = 2004 - birth.g3$birthyr)

age <- rbind(off.age, g3.age)
####################################
####################################
###                              ###
### Create Phenotype Data Frames ###
###                              ###
####################################
####################################

offspring <- data.frame(dbGaP_Subject_ID = off$dbGaP_Subject_ID,
                        shareid = off$shareid,
                        cpd = cpd.off,
                        si  = si.off,
                        sc  = sc.off,
                        ai  = ai.off,
                        dpw = dpw.off,
                        dnd = dnd.off,
                        bde = rep(NA, nrow(off)),
                        bdl = rep(NA, nrow(off)),
                        height = height.off,
                        weight = weight.off,
                        cohort = rep(1, nrow(off)))

generation3 <- data.frame(dbGaP_Subject_ID = g3$dbGaP_Subject_ID,
                          shareid = g3$shareid,
                          cpd = cpd.g3,
                          si  = si.g3,
                          sc  = sc.g3,
                          ai  = ai.g3,
                          dpw = dpw.g3,
                          dnd = dnd.g3,
                          bde = bde.g3,
                          bdl = bdl.g3,
                          height = height.g3,
                          weight = weight.g3,
                          cohort = rep(2, nrow(g3)))


tmp <- rbind(offspring, generation3)

phenotypes <- merge(tmp, age, by="dbGaP_Subject_ID")
phenotypes$age2 <- phenotypes$Age^2


##################
### ID MAPPING ###
##################
### ID mapping file from dbGaP_Subject_ID to SAMPID
ID.map <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht001415.v16.p10.Framingham_Sample.MULTI.txt.gz"), header=T, sep="\t", stringsAsFactors=F)

### Genotype files
genotype.IDs <-  read.table("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/ChildStudyConsentSet_phs000342.Framingham.v16.p10.c1.HMB-IRB-MDS/GenotypeFiles/phg000006.v9.FHS_SHARe_Affy500K.genotype-calls-matrixfmt.c1/subject_level_PLINK_sets/FHS_SHARe_Affy500K_subjects_c1.fam", header=F)
names(genotype.IDs) <- c("famid", "SAMPID", "patid", "matid", "sex", "phenotype")

length(which(genotype.IDs$SAMPID %in% ID.map$SAMPID))
## 6954

x <- merge(genotype.IDs, ID.map, by="SAMPID", all.x=T)
x <- x[,c(1:5,7)]

### There are many duplicates, which I can remove because I only want
### a single entry for every SAMPID-SUBJID
x <- x[which(!duplicated(x$dbGaP_Subject_ID)),]


almost.final <- merge(phenotypes, x, by="dbGaP_Subject_ID", all.x=T)

####################################################
### Bring in genetic PCs and ancestry categories ###
####################################################

PCs <- read.table("/work/KellerLab/Zhen/FRAMINGHAM/PCA/FRAMINGHAM_pcs_and_ancestries.txt", header=T)
names(PCs)[2] <- c("SAMPID")

final <- merge(almost.final, PCs, by="SAMPID", all.x=T)

phenotypes.ped <- subset(final, ancestry=="EUR",
                         select=c("famid", "SAMPID", "patid",
                             "matid", "sex", "cpd", "ai", "si",
                             "sc", "dpw", "dnd", "bde", "bdl"))
phenotypes.ped[is.na(phenotypes.ped)] <- "x"


covariates.ped <- subset(final, ancestry=="EUR",
                         select=c("famid", "SAMPID", "patid",
                             "matid", "sex", "Age", "age2", "sc",
                             "height", "weight", "cohort",
                             "PC1", "PC2", "PC3", "PC4", "PC5",
                             "PC6", "PC7", "PC8", "PC9", "PC10"))
covariates.ped[is.na(covariates.ped)] <- "x"

write.table(phenotypes.ped, file="Framingham.EUR.phenotypes.ped", quote=F,
            col.names=T, row.names=F)

write.table(covariates.ped, file="Framingham.EUR.covariates.ped", quote=F,
            col.names=T, row.names=F)


===== Genotypes =====

We used the Affy 500K genotypes found here: /work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/ChildStudyConsentSet_phs000342.Framingham.v16.p10.c1.HMB-IRB-MDS/GenotypeFiles/phg000006.v9.FHS_SHARe_Affy500K.genotype-calls-matrixfmt.c1/subject_level_PLINK_sets/FHS_SHARe_Affy500K_subjects_c1.[bed|bim|fam]


====== ARIC ======


===== Phenotypes =====


phenotypes <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/PhenotypeFiles/phs000090.v3.pht000114.v2.p1.c1.GENEVA_ARIC_Subject_Phenotypes.HMB-IRB.txt.gz",header=T,sep="\t",stringsAsFactors=F)

phenotypes <- subset(phenotypes, select=c("geneva_id", "racegrp", "gender","v1age01", "anta01","anta04", "drnkr01", "hom29", 'hom35', "hom32", 'cigt01','evrsmk01', 'dtia90','dtia96', 'dtia97','dtia98', 'cursmk01','forsmk01'))

### rename phenotypes to be readable
names(phenotypes)[c(1,2,3,4,5,6)] <- c("geneva_id", "race", "sex", "age", "height", "weight")

### To connect sample ids to geneva ids, take SAMPID (the ID used in the genotype fam file, and SUBJID (aka geneva_id) )
id_map <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-qc.MULTI/geno-qc/samp-subj-mapping.csv.gz"),header=T,sep=",",stringsAsFactors=F)[,c(2,1)]
names(id_map) <- c("SAMPID", "geneva_id")
phenotypes <- merge(phenotypes, id_map, by="geneva_id", all.x=TRUE)


### import genotype data to get family info
fam_data <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-calls-matrixfmt.c1.GRU.update1/Genotypes_with_flagged_chromosomal_abnormalities_zeroed_out/ARIC_PLINK_flagged_chromosomal_abnormalities_zeroed_out.fam", col.names = c("fam_id", "SAMPID", "patid", "matid", "sex", "dummy"))

### Replace 0's with "x" for rvTest preferred formatting
fam_data$patid <- fam_data$matid <- fam_data$fam_id[fam_data$fam_id == 0] <- "x"


########################################
###---- Derive GSCAN phenotypes -----###
########################################

### DRINKER VERSUS NON-DRINKER
### ARIC variable name is "drnkr01".
### Combination of "Do you presently drink alcoholic beverages?" and "Have you ever consumed alcoholic beverages?" 
###   Response option for both questions are "yes" or "no", which are turned into the options below.  
###   Response options:
###           1 = Current Drinker
###           2 = Former Drinker
###           3 = Never Drinker
###           4 = Unknown
###
### Descriptives: 
### table(phenotypes$drnkr01)
###    1    2    3    4 
### 7257 2309 3153    6 
###
###  To obtain GSCAN "DND" collapse across Former and Never Drinkers
###  and make "Non-Drinkers". Current Drinkers will be made "Drinkers"

dnd <- phenotypes$drnkr01
dnd[dnd == 1] <- "Current Drinker"
dnd[dnd == 2 | dnd == 3] <- 1
dnd[dnd == "Current Drinker"] <- 2
dnd[dnd == 4 | is.na(dnd)] <- "x"

### AGE OF INITIATION OF SMOKING
###
### ARIC variable name is "hom29".
###    "How old were you when you first started regular cigarette smoking?"
###    Response option is an integer value.
###
### Descriptives: 
###
### > table(phenotypes$hom29)
###     0    1    4    5    6    7    8    9   10   11   12   13   14   15   16   17
###    19    1    2   10   11   15   22   32   65   44  154  187  302  659  941  715
###    18   19   20   21   22   23   24   25   26   27   28   29   30   31   32   33
###  1219  567  703  447  275  129   88  247   56   45   59   22  100    8   34    8
###    34   35   36   37   38   39   40   41   42   43   44   45   46   47   48   49
###    17   51    9    9   10    8   35    3   11    5    5   16    3    4    2    3
###    50   51   52   55   57   59   60   62
###     7    1    2    1    1    2    2    1
###
### > summary(phenotypes$hom29)
###   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
###   0.00   16.00   18.00   18.77   20.00   62.00    5377 
###
ai <- phenotypes$hom29
### remove ages older than 35 and younger than 10
ai[ai > 35 | ai < 10 | is.na(ai)] <- "x"


### CIGARETTES PER DAY
### ARIC variable name is "hom35"
###    "On the average of the entire time you smoked, how many cigarettes did you usually smoke per day?"
###    Response option is integer, or "0" for <1 cigarette per day
###
### > table(phenotypes$hom35)
###     0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15
###    46   54  118  187  133  254  146   84   89   17  990   23  106   30   12  520
###    16   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31
###    31   30   62    5 2559    1    5   10    5  193    9    3    7    6  771    3
###    32   33   34   35   36   37   38   40   42   43   45   50   51   54   55   58
###     1    1    1   44    3    1    1  572    1    3   14   72    1    1    3    1
###    60   65   70   75   80   86   90   99
###   100    1    4    2   10    1    1    3
### > summary(phenotypes$hom35)
###    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
###    0.00   10.00   20.00   19.67   24.00   99.00    5420
### Responses are binned in accordance with the GSCAN Analysis Plan. 
cpd <- phenotypes$hom35
cpd[cpd <=  5 & cpd >=  1] <- 1
cpd[cpd <= 15 & cpd >=  6] <- 2
cpd[cpd <= 25 & cpd >= 16] <- 3
cpd[cpd <= 35 & cpd >= 26] <- 4
cpd[cpd >= 36 & cpd <= 60] <- 5
cpd[cpd > 60 | is.na(cpd)] <- "x"


### DRINKS PER WEEK
### ARIC variable names are "dtia96", "dtia97", and "dtia98"
###     "dtia96" - "How many glasses of wine do you usualy have per week? (4oz. glasses; round down)." 
###     "dtia97" - "How many bottles of cans or beer do you usualy have per week? (12oz. bottles or cans; round down)." 
###     "dtia98" - "How many drinks of hard liquor do you usualy have per week? (4oz. glasses; round down)."
###      Response option for all three is integer. 
###
###  Descriptives:
###
### >table(phenotypes$dtia96)
###  0    1    2    3    4    5    6    7    8    9   10   11   12   14   15   16 
###  5226  844  461  255  147   90   75   50   27    3   34    1   15   28    9    2 
###  17   18   20   21   25   28   30   32   33   35   40 
###   1    3    7    5    1    2    3    1    1    1    1 
###
### >summary(phenotypes$dtia96)
### Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
###  0.000   0.000   0.000   0.868   1.000  40.000    5478 
###
### >table(phenotypes$dtia97)
### 0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
### 4356  674  528  312  214  107  297   93   76   10   97    3  186    4   40   28 
###  16   18   19   20   21   22   23   24   25   28   30   32   33   35   36   40 
###   5   28    1   36   19    1    1   89    8    8   12    2    1   10    8    6 
###  42   45   48   49   50   56   60   63   70   72   80   92 
###  13    2    6    1    3    2    4    1    1    2    1    1 
###
### >summary(phenotypes$dtia97)
### Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
###  0.000   0.000   0.000   2.609   2.000  92.000    5474 
###
### >table(phenotypes$dtia98)
###  0    1    2    3    4    5    6    7    8    9   10   11   12   14   15   16 
###  5226  844  461  255  147   90   75   50   27    3   34    1   15   28    9    2 
###  17   18   20   21   25   28   30   32   33   35   40 
###   1    3    7    5    1    2    3    1    1    1    1 
###
### >summary(phenotypes$dtia96)
### Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
###  0.000   0.000   0.000   0.868   1.000  40.000    5478 
###
### >table(phenotypes$dtia97)
### 0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
### 4356  674  528  312  214  107  297   93   76   10   97    3  186    4   40   28 
###  16   18   19   20   21   22   23   24   25   28   30   32   33   35   36   40 
###   5   28    1   36   19    1    1   89    8    8   12    2    1   10    8    6 
###  42   45   48   49   50   56   60   63   70   72   80   92 
###  13    2    6    1    3    2    4    1    1    2    1    1 
###
### >summary(phenotypes$dtia97)
### Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
###  0.000   0.000   0.000   2.609   2.000  92.000    5474 
###
### >table(phenotypes$dtia98)
###  0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
### 4387  735  551  295  239  190  148  138   70   11  151   14   57    3  103   33 
###  16   17   18   20   21   24   25   26   27   28   30   32   33   34   35   36 
###   9    9    7   36   30    1   10    1    1    9    6    2    1    2    4    1 
###  39   40   44   45   47   48   50   51   52   54   55   56   63   64   75   77 
###   1    7    1    1    1    2    5    1    1    1    1    3    1    2    1    2 
###  90   99 
###   1    2 
###
### >summary(phenotypes$dtia98)
### Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
###  0.000   0.000   0.000   2.227   2.000  99.000    5483 

wine <- phenotypes$dtia96 # 1 drink = 4oz
beer <- phenotypes$dtia97 # 1 drink = 12oz
spirits <- phenotypes$dtia98 # 1 drink = 1.5oz

### muliply wine by 4 and divide wine by 5 to normalize to standard drink of 5 oz for wine
### Combine all alcohol types, left-anchor at 1, and log
dpw <- log((wine*4/5 + beer + spirits) + 1)
dpw[is.na(dpw)] <- "x"


### SMOKING INITIATION
### ARIC variable name is "evrsmk01"
### The variable checks answers to "Have you ever smoked cigarettes?" and "Do you now smoke cigarettes?".
###    Response options are "yes" or "no". 
###
### Descriptives:
###
### >table(phenotypes$evrsmk01)
### 0    1 
### 5328 7434 
###
### >summary(phenotypes$evrsmk01)
### Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
### 0.0000  0.0000  1.0000  0.5825  1.0000  1.0000       9 

si <- phenotypes$evrsmk01
si[si == 1] <- 2
si[si == 0] <- 1
si[is.na(si)] <- "x"


### SMOKING CESSATION
### ARIC variable names are "cursmk01" and "forsmk01"
### Both varaiables take into account the questions: "Have you ever smoked cigarettes?" and "Do you now smoke cigarettes?"
###    Response options are "yes" or "no". 
###    Smoking inititation (si) is coded as "2" for "Smoker" if "yes" to "Have you ever smoked cigarettes?"
###            If a subsequent "yes" to "Do you now smoke cigarettes?", smoking cessation (sc) is coded as "2" for "Current Smoker".
###            If a subsequent "no" to "Do you now smoke cigarettes?",  smoking cessation (sc) is coded as "1" for "Former Smoker".       
###    Smoking inititation (si) is coded as "1" for "Non Smoker" if "no" to "Have you ever smoked cigarettes?" 

current.smoker <- subset(phenotypes, select=c("cursmk01"))
former.smoker <- subset(phenotypes, select=c("forsmk01"))
N <- nrow(phenotypes)
sc <- rep(NA, N)
for(i in 1:N){
  if(is.na(current.smoker[i,1]) | is.na(former.smoker[i,1])){
    sc[i] <- NA
  }
  else if (current.smoker[i,1]  == 0 & former.smoker[i,1] == 0){
    sc[i] <- NA
  }
  else if (current.smoker[i,1]  == 0 & former.smoker[i,1] == 1){
    sc[i] <- 1 ### former smokers are coded as 1
  }
 else if (current.smoker[i,1]  == 1 & former.smoker[i,1] == 0){
    sc[i] <- 2 ### current smokers are coded as 2
  }
}
sc[is.na(sc)] <- "x"


### Create dataframe with our new GSCAN variables
N <- nrow(phenotypes)
NAs <- rep("x", N)
gscan.phenotypes <- data.frame(SAMPID = phenotypes$SAMPID, famid = NAs,geneva_id = phenotypes$geneva_id,patid = NAs,matid = NAs,sex = ifelse(phenotypes$sex == "M", 1, 2),cpd = cpd,ai = ai,si = si,sc = sc,dnd = dnd,dpw = dpw,age = phenotypes$age,age2 = phenotypes$age^2,height = phenotypes$height,weight = phenotypes$weight,currentformersmoker = sc)

### Merge in the SAMPID, which is used in the genotype files
gscan.phenotypes <- merge(gscan.phenotypes, id_map, by="geneva_id", all.x=TRUE)

### Reorder phenotype file to make pedigree file consistent with genotype IDs
gscan.phenotypes <- gscan.phenotypes[c(17,2,16,3:15)]
colnames(gscan.phenotypes) [2] <- "SAMPID"

### Read in PCs and add to pedigree file, then write out to a phenotype and covariate file
### [ here read in PCs and merge into phenotype file (probably by the SAMPID) ]
pcs <- read.table("/rc_scratch/hayo0753/aric/aric_ancestry_and_pcs", head=TRUE, stringsAsFactors=F)
colnames(pcs) [1] <- "SAMPID"
pcs <- merge(pcs, gscan.phenotypes, by="SAMPID", all.x=TRUE)

############# PRELIMINARY #####################

### Write to file [NOTE TO HANNAH: will have to be changed once we
### have PCs and ancestry groups identified. PCs will have to be read
### in like with read.table() and we'll have to subset the dataset
### into European and African ancestry, and then write out one
### phenotype and covariate file per ancestry group.


### EUROPEANS
phenotypes.EUR.ped <- subset(pcs, ancestry == "EUR",select=c("famid","SAMPID","patid","matid", "sex","cpd", "ai","si", "sc", "dnd","dpw"))
write.table(phenotypes.EUR.ped, file="ARIC.EUR.phenotypes1.ped", quote=F, col.names=T, row.names=F, sep="\t")

covariates.EUR.ped <- subset(pcs, ancestry == "EUR", select=c("famid","SAMPID","patid","matid", "sex","age", "age2", "height", "weight", "currentformersmoker","PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8","PC9", "PC10"))
write.table(covariates.EUR.ped, file="ARIC.EUR.covariates1.ped", quote=F, col.names=T, row.names=F, sep="\t")

phenotypes.AFR.ped <- subset(pcs, ancestry == "AFR",select=c("famid","SAMPID","patid","matid", "sex","cpd", "ai","si", "sc", "dnd","dpw"))
write.table(phenotypes.AFR.ped, file="ARIC.AFR.phenotypes1.ped", quote=F, col.names=T, row.names=F, sep="\t")

covariates.AFR.ped <- subset(pcs, ancestry == "AFR", select=c("famid","SAMPID","patid","matid", "sex","age", "age2", "height", "weight", "currentformersmoker", "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8","PC9", "PC10"))
write.table(covariates.AFR.ped, file="ARIC.AFR.covariates1.ped", quote=F, col.names=T, row.names=F, sep="\t")


### Must remove duplicates from all files, use UNIX
### sort ARIC.EUR.phenotypes1.ped | uniq > ARIC.EUR.phenotypes.ped
### sort ARIC.EUR.covariates.ped | uniq > ARIC.EUR.covariates.ped
### sort ARIC.AFR.phenotypes1.ped | uniq > ARIC.AFR.phenotypes.ped
### sort ARIC.AFR.covariates.ped | uniq > ARIC.AFR.covariates.ped


====== MESA ======

(Hannah/Joyce to update following Framingham as a guide)


===== Phenotypes =====

Description of phenotypes can be found here: {{:file_mesa_phenotypes_-_final.pdf}}


====== eMERGE ======


===== Phenotypes =====


options(stringsAsFactors=F)


### eMERGE is broken into different consent classes. We can conduct analyses on hmb, hmb-gso-nic, and 


emerge.hmb <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c1.HMB/PhenotypeFiles/phs000360.v3.pht003255.v2.p1.c1.MergedSet_Subject_Phenotypes.HMB.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F)
emerge.hmb.genos <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c1.HMB/GenotypeFiles/matrix/c1.HMB/eMerge_660_11212012_c1.fam", header=FALSE, sep="\t", stringsAsFactors=F)


emerge.hmb.gso.nic <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c3.HM-B-GSO-NIC/PhenotypeFiles/phs000360.v3.pht003255.v2.p1.c3.MergedSet_Subject_Phenotypes.HM-B-GSO-NIC.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F)
emerge.hmb.gso.nic.genos <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c3.HM-B-GSO-NIC/GenotypeFiles/matrix/c3.HM-B-GSO-NIC/eMerge_660_11212012_c3.fam", header=FALSE, sep="\t", stringsAsFactors=F)


emerge.hmb.gso <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c4.HMB-GSO/PhenotypeFiles/phs000360.v3.pht003255.v2.p1.c4.MergedSet_Subject_Phenotypes.HMB-GSO.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F)
emerge.hmb.gso.genos <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c4.HMB-GSO/GenotypeFiles/matrix/c4.HMB-GSO/eMerge_660_11212012_c4.fam", header=FALSE, sep="\t", stringsAsFactors=F)


### Merge all files above according to SUBJID, which is used in the
### genotype files.

emerge <- merge(emerge.hmb, emerge.hmb.gso, all=T)
emerge <- merge(d, emerge.hmb.gso.nic, all=T)

### SMOKING INITIATION
###
### The eMERGE variable name is SMOKING_STATUS
###      C65108 = never smoker
###      C67147 = current smoker
###      C67148 = past smoker
###      C67151 = Unknown if ever smoked
###
### Descriptives:
###
### table(emerge$SMOKING_STATUS)
###
### C65108 C67147 C67148 C67151 
###   2217   1736   3457   9635 

si <- emerge$SMOKING_STATUS
si[si == "C67147" | si == "C67148"] <- 2
si[si == "C65108"] <- 1
si[si != 1 & si != 2] <- NA

### SMOKING Cessation
###
### Current == 2 & Former == 1 in GSCAN. This is already the case for these data.

sc <- emerge$SMOKING_STATUS
sc[sc == "C67147"] <- 2
sc[sc == "C67148"] <- 1
sc[sc != 1 & sc != 2] <- NA


### eMERGE age variable is tricky because there is no obvious age at
### assessment. We will use their "DECADE_BIRTH" as a terrible
### approximation.
### 1=1900-1919; 2=1920-1929, 3=1930-1939; 4=1940-1949; 5=1950-1959; 6=Unknown
###
### Descriptives:
###
### table(emerge$DECADE_BIRTH)
###
###   .    1    2    3    4    5    6    7    8    9   99 
###   6  612 2667 3533 4439 3127 1291  761  490   10  109 
birthyear <- emerge$DECADE_BIRTH
birthyear[birthyear == "99"] <- NA
birthyear[birthyear == "."] <- NA


### SEX
sex <- emerge$SEX
sex[sex == "C46109"] <- 1
sex[sex == "C46110"] <- 2


### Scott decided not to correct for additional case-control variables
### given what appears to be a highly complex sample and uncertainty
### about the best course of action to account for disease status in
### conducting smoking analyses.

phenotypes <- data.frame(fid = emerge$SUBJID,
                         iid = emerge$SUBJID,
                         patid = "x",
                         matid = "x",
                         sex = sex,
                         si = si,
                         sc = sc)

phenotypes[is.na(phenotypes)] <- "x"

write.table(phenotypes,
            "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/eMERGE/GSCAN_eMERGE_phenotypes.ped",
            row.names=F,
            quote = F,
            sep="\t")


covariates  <- data.frame(fid = emerge$SUBJID,
                          iid = emerge$SUBJID,
                          patid = "x",
                          matid = "x",
                          sex = sex,
                          birthyear = birthyear)                         

covariates[is.na(covariates)] <- "x"

write.table(covariates,
            "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/eMERGE/GSCAN_eMERGE_covariates.ped",
            row.names=F,
            quote = F,
            sep="\t")


====== Stroke ======


===== Phenotypes =====


### Date: Feb 13 2017
### Author: Scott Vrieze

options(stringsAsFactors=F)

### Load in dataset ###

ninds <- read.table("/work/KellerLab/GSCAN/dbGaP/CIDR_StrokeGenetics/PhenoGenotypeFiles/RootStudyConsentSet_phs000615.CIDR_International_StrokeGenetics.v1.p1.c3.GRU-NPU/PhenotypeFiles/phs000615.v1.pht003307.v1.p1.c3.IschemicStroke_Subject_Phenotypes.GRU-NPU.txt.gz",
                    header = T, sep="\t")

### The file reads into R incorrectly because of a weird trailing tab
### character in the data file, so use the below code to shift column
### names to the correct column.
names(ninds)[1] <- "XXX"
ninds$dbGaP_Subject_ID <- row.names(ninds)
ninds$smokingStatus <- NULL
names(ninds) <- c(names(ninds)[2:17], "smokingStatus", "dbGaP_Subject_ID")
    

### subset the only variables needed
pheno <- subset(ninds, select=c("subject_id", "smokingStatus", "age", "gender", "AFFECTION_STATUS"))


###————————————————————###
### SMOKING INITIATION ###
###————————————————————###

### NINDS variable is “smokingStatus”
### Variables are “NEVER”, “FORMER”, “CURRENT”, “NO”, “UNKNOWN”, and blank.
###
### ’CURRENT' defined as cigarette smoking within last 30 days,
### 'FORMER' defined as ### more than 100 cigarettes in one's lifetime
### but no smoking within the last 30 ### days; ’NEVER' defined as
### less than 100 cigarettes smoked in one's lifetime.
### table(pheno$smokingStatus)
###
###        CURRENT  FORMER   NEVER      NO UNKNOWN
###     63     727    1137    2397       4     251

si <- pheno$smokingStatus
si[si == "CURRENT" | si == "FORMER"] <- 2
si[si == "NEVER"   | si == "NO"] <- 1
si[si != "1" & si != "2"] <- NA
si <- as.numeric(si)

### table(si)
###
###    1    2
### 2401 1864


###————————————————————-###
###  SMOKING CESSATION  ###
###————————————————————-###

### NINDS variable is “smokingStatus”
### Variables are “NEVER”, “FORMER”, “CURRENT”, “NO”, “UNKNOWN”, and blank.
###
### ’CURRENT' defined as cigarette smoking within last 30 days,
### 'FORMER' defined as more than 100 cigarettes in one's lifetime but
### no smoking within the last 30 days; ’NEVER' defined as less than
### 100 cigarettes smoked in one's lifetime.
### table(pheno$smokingStatus)
###
###        CURRENT  FORMER   NEVER      NO UNKNOWN
###     63     727    1137    2397       4     251

sc <- pheno$smokingStatus
sc[sc == "CURRENT"] <- 2
sc[sc == "FORMER"] <- 1
sc[sc != "1" & sc != "2"] <- NA
sc <- as.numeric(sc)

### table(pheno$V5)
###
###    1    2
### 1137  727


###——————————###
###  GENDER  ###
###——————————###

### NINDS variable is “gender”
### Variables are “F” and “M”
### table(pheno$gender)
###
###    F    M
### 2627 1952

sex <- pheno$gender
sex[sex == "M"] <- 1
sex[sex == "F"] <- 2
sex <- as.numeric(sex)

###----------------###
### Write to files ###
###----------------###

### This study uses the "subject_id" ID field in the genotype files,
### so use that here, instead of the dbGaP_Subject_ID
phenotypes <- data.frame(fid = pheno$subject_id,
                     iid = pheno$subject_id,
                     patid = "x",
                     matid = "x",
                     sex = sex,
                     si = si,
                     sc = sc)

write.table(phenotypes,
            "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/Stroke/GSCAN_STROKE_phenotypes.ped",
            row.names=F,
            quote = F,
            sep="\t")


covariates  <- data.frame(fid = pheno$subject_id,
                    iid = pheno$subject_id,
                    patid = "x",
                    matid = "x",
                    sex = sex,
                    age = pheno$age,
                    age2 = pheno$age^2,
                    AFFECTION_STATUS = pheno$AFFECTION_STATUS)

write.table(covariates,
            "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/Stroke/GSCAN_STROKE_covariates.ped",
            row.names=F,
            quote = F,
            sep="\t") ### save table


====== BEAGESS ======


===== Phenotypes =====


options(stringsAsFactors=F)

beagess <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/PhenotypeFiles/phs000869.v1.pht004610.v1.p1.c1.BEAGESS_Subject_Phenotypes.GRU-MDS.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F)

genos <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/GenotypeFiles/phg000580.v1.NCI_BEAGESS.genotype-calls-matrixfmt.c1.GRU-MDS.update/BEAGESS_dbGaP_29Jan2015.fam", header=FALSE, stringsAsFactors=F)


### The file reads into R incorrectly because of a weird trailing tab
### character in the data file, so use the below code to shift column
### names to the correct column.
names(beagess)[1] <- "XXX"
beagess$dbGaP_Subject_ID <- row.names(beagess)
beagess$assocEABEvsCO <- NULL
names(beagess) <- c(names(beagess)[2:11], "assocEABEvsCO", "dbGaP_Subject_ID")


### SMOKING INITIATION
###
### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI".
###    "Cigarette smoking status."
###    Response options are "-99", "-9", "0", "1" and "2".
###       -99 = not consented
###       -9 = Missing
###        0 = Never
###        1 = Former
###        2 = Current
###
### Descriptives:
###
### > table(beagess_data$BMI)
###
### -99   -9    0    1    2
### 494 2051 1515 2288  575
###
### > summary(beagess_data$BMI)
### Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
### -99.000  -9.000   0.000  -9.234   1.000   2.000

si <- beagess$cig_smk_status
si[si == 1 | si == 2] <- 2
si[si == 0] <- 1
si[si != 1 & si != 2] <- NA

### SMOKING Cessation
###
### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI".
###    "Cigarette smoking status."
###    Response options are "-99", "-9", "0", "1" and "2".
###       -99 = not consented
###       -9 = Missing
###        0 = Never
###        1 = Former
###        2 = Current
###
### Descriptives:
### table(beagess$cig_smk_status)
###  -99   -9    0    1    2 
###  494 2051 1515 2288  575 
###
### Current == 2 & Former == 1 in GSCAN. This is already the case for these data.

sc <- beagess$cig_smk_status
sc[sc == 0 | sc == -9 | sc == -99] <- NA

### BEAGESS variable name is "agegpcat"
###   "Age in 5 year categories"
###   Response options are integers 1 through 14 or -9.
###       -9 = Missing
###       1 = 15-29 years of age
###       2 = 30-34 years of age
###       3 = 35-39 years of age
###       4 = 40-44 years of age
###       5 = 45-49 years of age
###       6 = 50-54 years of age
###       7 = 55-59 years of age
###       8 = 60-64 years of age
###       9 = 65-69 years of age
###       10 = 70-74 years of age
###       11 = 75-79 years of age
###       12 = 80-84 years of age
###       13 = 85-89 years of age
###       14 = 90-100 years of age
###
### Descriptives:
###
### > table(beagess_data$agegpcat)
###
### -9    1    2    3    4    5    6    7    8    9   10   11   12   13   14
### 27   19   48  112  190  378  657  958 1134 1238 1018  794  246   87   17
###
### This is fine, but change the -9 to NA
beagess$agegpcat[beagess$agegpcat == -9] <- NA


### looks like BEAGESS uses their own internal "SUBJECT_ID" in the
### genotype files, so we'll use that in our pedigree files
phenotypes <- data.frame(fid = beagess$SUBJECT_ID,
                         iid = beagess$SUBJECT_ID,
                         patid = "x",
                         matid = "x",
                         sex = beagess$sex,
                         si = si,
                         sc = sc)

phenotypes[is.na(phenotypes)] <- "x"

write.table(phenotypes,
            "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/BEAGESS/GSCAN_BEAGESS_phenotypes.ped",
            row.names=F,
            quote = F,
            sep="\t")


covariates  <- data.frame(fid = beagess$SUBJECT_ID,
                          iid = beagess$SUBJECT_ID,
                          patid = "x",
                          matid = "x",
                          sex = beagess$sex,
                          age = beagess$age,
                          age2 = beagess$age^2,
                          Barretts.esophagus.case_v_control = beagess$assocBEvsCO,
                          Esophageal.adenocarcinoma.case_v_control = beagess$assocEAvsCO)

covariates[is.na(covariates)] <- "x"

write.table(covariates,
                        "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/BEAGESS/GSCAN_BEAGESS_covariates.ped",
                        row.names=F,
                        quote = F,
                        sep="\t") ### save table                 


====== Jackson Heart Study ======


===== Phenotypes =====


hmb <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/PhenotypeFiles/phs000499.v3.pht003209.v1.p1.c3.JHS_CARe_Subject_Phenotypes.HMB-IRB.txt.gz", header=T, sep="\t", stringsAsFactors=F)
npu <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c1.HMB-IRB-NPU/PhenotypeFiles/phs000499.v3.pht003209.v1.p1.c1.JHS_CARe_Subject_Phenotypes.HMB-IRB-NPU.txt.gz", header=T, sep="\t", stringsAsFactors=F)
phens <- rbind(hmb, npu)

hmb_geno <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/GenotypeFiles/phg000103.v1.JHS.genotype-calls-matrixfmt.c3.HMB-IRB/JHS_PLINK_illu_GRU.fam", header=F, stringsAsFactors=F)
npu_geno <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c1.HMB-IRB-NPU/GenotypeFiles/phg000103.v1.JHS.genotype-calls-matrixfmt.c1.HMB-IRB-NPU/JHS_PLINK_illu_NCU.fam", header=F, stringsAsFactor=F)
geno <- rbind(hmb_geno, npu_geno)

jhs_map <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/GenotypeFiles/phg000104.v1.JHS.sample-info.MULTI/JHS_subject_sample_map_8_10_2010.txt", header=T, sep="\t", stringsAsFactors=F)
jhs_pcs <- read.table("/rc_scratch/hayo0753/JHS_files/JHS_ancestry_and_pcs.txt", header=T, stringsAsFactors=F)

names(jhs_pcs) [2] <- "SUBJID"
names(geno) <- c("SUBJID", "iid", "patid", "matid", "sex", "phen")
filtered_geno <- geno[(geno$SUBJID %in% phens$SUBJID),]
filtered_phens <- phens[(phens$SUBJID %in% geno$SUBJID),]

mapped_geno <- merge(filtered_phens, filtered_geno, by="SUBJID", all.x=T)
mapped_geno_pcs <- merge(mapped_geno, jhs_pcs, by="SUBJID", all.x=T)

covariates <- mapped_geno_pcs[c(1,69,70:72,5,75:84)]
names(covariates) [1] <- "fid"
names(covariates) [2] <- "iid"
names(covariates) [6] <- "age"

###--------------------###
### Smoking initiation ###
###--------------------###
### 
### JHSCare variables are "current_smoker_baseline" and "former_smoking_baseline"
###       current_smoker_baseline = Current smoking status at first participant visit
###       former_smoker_baseline = Former smoking status at first participant visit
###
### Response options are
###     0 - No
###     1 - Yes
###
### table(si)
###
###    0    1    x
### 1206 537  9

current.smoker <- subset(mapped_geno_pcs, select=c("current_smoker_baseline", "SUBJID"))
former.smoker <- subset(mapped_geno_pcs, select=c("former_smoker_baseline", "SUBJID"))
N <- nrow(mapped_geno_pcs)
si <- rep(NA, N)
for(i in 1:N){
  if(is.na(current.smoker[i,1]) | is.na(former.smoker[i,1])){
    si[i] <- "x"
  }
  else if (current.smoker[i,1]  == 0 & former.smoker[i,1] == 0){
    si[i] <- "1"
  }
  else if (current.smoker[i,1]  == 0 & former.smoker[i,1] == 1){
    si[i] <- 2 ### former smokers are coded as 2
  }
 else if (current.smoker[i,1]  == 1 & former.smoker[i,1] == 0){
    si[i] <- 2 ### current smokers are coded as 2
  }	
}
mapped_geno_pcs_phen <- cbind(si, mapped_geno_pcs)


###--------------------###
### Smoking Cessation ###
###--------------------###
### 
### JHSCare variables are "current_smoker_baseline" and "former_smoking_baseline"
###       current_smoker_baseline = Current smoking status at first participant visit
###       former_smoker_baseline = Former smoking status at first participant visit
###
### Response options are
###     0 - No
###     1 - Yes
###
### table(sc)
###
###    0    1    x
### 1206 537  9


current.smoker <- subset(mapped_geno_pcs, select=c("current_smoker_baseline", "SUBJID"))
former.smoker <- subset(mapped_geno_pcs, select=c("former_smoker_baseline","SUBJID"))
N <- nrow(mapped_geno_pcs)
sc <- rep(NA, N)
for(i in 1:N){
  if(is.na(current.smoker[i,1]) | is.na(former.smoker[i,1])){
    sc[i] <- "x"
  }
  else if (current.smoker[i,1]  == 0 & former.smoker[i,1] == 0){
    sc[i] <- "x"
  }
  else if (current.smoker[i,1]  == 0 & former.smoker[i,1] == 1){
    sc[i] <- 1 ### former smokers are coded as 1
  }
 else if (current.smoker[i,1]  == 1 & former.smoker[i,1] == 0){
    sc[i] <- 2 ### current smokers are coded as 2
  }
}
phenotypes <- cbind(sc, mapped_geno_pcs_phen)

phenotypes <- phenotypes[c(3,71,72:74,1:2)]
names(phenotypes)[1] <- "fid"

write.table(covariates, "AFR.JHS.covariates.ped", row.names=F, quote=F)
write.table(phenotypes, "AFR.JHS.phenotypes.ped", row.names=F, quote=F)


======= Genotype Processing =======


===== Pre-Phasing QC =====

QC parameters that we chose:
MAF > 0.01

SNP callrate > 0.95

Missingness per individual > 0.95

HWE = 0.05 / number of markers but greater than 5e-8

 
## To update the strand builds: http://www.well.ox.ac.uk/~wrayner/strand/
## Check strands against latest 1000G: http://www.well.ox.ac.uk/~wrayner/tools/
perl HRC-1000G-check-bim.pl \
     -b ARIC_b37_filtered.bim \
     -f ARIC_b37_filtered.frq \
     -r  1000GP_Phase3_combined.legend \
     -g \
     -p EUR


## Phasing using shapeit (ARIC used as an example)
shapeit -B ARIC_b37_filtered-updated-chr${1} -M /rc_scratch/meli7712/dbGAP/1000GP_Phase3/genetic_map_chr${1}_combined_b37.txt -O phased/ARIC_b37_filtered-updated-chr${1}.phased -T 48
 

## To convert the shapeit output into vcf
shapeit -convert --input-haps mesa-chr${1}.phased \
                 --output-vcf mesa-chr${1}.phased.vcf \
                 -T 12

## Imputation
Minimac3-omp3 --haps mesa-chr${1}.phased.vcf \ 
         --cpus 48 
         --refHaps /rc_scratch/meli7712/dbGAP/references/${1}.1000g.Phase3.v5.With.Parameter.Estimates.m3vcf.gz \
         --chr ${1} \
         --noPhoneHome \
         --format GT,DS,GP \
         --allTypedSites \
         --prefix mesa-chr${1}.phased.imputed