#load in the workshop materials from terminal #cp -r /faculty/andrew/GenomicSEM_practical/ ./ #STEP 0: load in GenomicSEM #load in the devtools package to load R packages from github #not necessary as Genomic SEM is already installed #but included here for when you run this package on your own computer #install.packages("devtools") #library(devtools) #install the package #again already done for you for the workshop #install_github("GenomicSEM/GenomicSEM") #load in the package require(GenomicSEM) #if unable to access materials on server all of these materials are also available at: #https://utexas.box.com/s/sounavy84gwygj0j2askcyaoo2ostbgu #set working directory (not necessary if creating "New Project" per the slides) #setwd("~/GenomicSEM_practical") ############################################## # Run Common Factor Model # ############################################## #STEP 1: MUNGE THE FILES ##PLEASE NOTE: We are using a subset of 1k SNPs for the munge and ldsc functions #in practice you will NOT use this subset but will just feed in the full set of summary #statistics you download online. This is just for the purposes of the tutorial and you getting a #sense of what to look for. We will load in a premade ldsc object using a full set of SNPs #before moving on to Step 3 #Munge takes four necessary arguments: #1. files = the name of the summary statistics files files<-c("SCZ_subset.txt", "BIP_subset.txt", "MDD_subset.txt") #2. hm3 = the name of the reference file to use for alligning effects to same ref allele across traits #can be found on our github hm3<-"eur_w_ld_chr/w_hm3.snplist" #3. trait.names = names used to create the .sumstats.gz output files trait.names<-c("SCZ","BIP","MDD") #4. N = total sample size for traits N<-c(105318, 16731, 173005) #Run the munge function. This will create three .sumstats.gz files (e.g., SCZ.sumstats.gz). munge(files=files,hm3=hm3, trait.names=trait.names, N=N) #STEP 2: RUN LD-SCORE REGRESSION #Takes five necessary arguments: #1. traits = the name of the .sumstats.gz traits traits<- c("SCZ.sumstats.gz", "BIP.sumstats.gz", "MDD.sumstats.gz") #2. sample.prev = the proportion of cases to total sample size. For quantitative traits list NA sample.prev <- c(.39,.45,.35) #3. population.prev = the population lifetime prevalence of the traits. For quantitative traits list NA population.prev <- c(.01,.01,.16) ##4. ld = folder of LD scores #can be found on our github (https://github.com/GenomicSEM/GenomicSEM) ld <- "eur_w_ld_chr/" #5. wld = folder of LD scores wld <- "eur_w_ld_chr/" #6. trait.names = optional sixth argument to list trait names so they can be named in your model trait.names<-c("SCZ", "BIP", "MDD") #Run the ldsc function COV_EXAMPLE<-ldsc(traits=traits, sample.prev=sample.prev, population.prev=population.prev, ld=ld, wld=wld,trait.names=trait.names) ##in many cases may want to save the data for later use. Example code provided here #save(COV_EXAMPLE, file="COV_EXAMPLE.RData") #STEP 3: ESTIMATE THE COMMON FACTOR MODEL ##going to first load in premade LDSC objects load("GenomicSEMPractical.RData") #requires only one necessary argument: #1. covstruc = the output from the ldsc function covstruc<-PSYCH_COV #an optional second argument can be provided for the estimation method estimation<-"DWLS" #run the commonfactor model below PFactor <- commonfactor(covstruc=covstruc,estimation=estimation) #Print PFactor results PFactor$results ################################################################# ###Pause here: We will discuss the next steps as a group. ###feel free to grab a coffee or take a minute if you finished early ################################################################### ############################################## # Run User Specified Model: Part 1 # ############################################## #STEP 3: SPECIFY AND RUN USER MODEL #Takes two necessary arguments: #1. covstruc = the output from multivariable ldsc #in this example = ldsc results for Schizophrenia, Bipolar, MDD, EA, and Insomnia covstruc<-PRAC_COV #2. model = the user specified model ##delete to write your own!! MY.model<-"F1=~NA*SCZ+BIP+MDD F1~~1*F1 INSOM~F1 EA~INSOM" #estimation = an optional third argument specifying the estimation method to use estimation<-"DWLS" #std.lv = optional fourth argument specifying whether variances of latent variables should be set to 1 std.lv=FALSE #Run your model YourModel<- usermodel(covstruc=covstruc, model=MY.model,estimation=estimation,std.lv=std.lv) #print the results of your model YourModel$results #print the model fit of your model YourModel$modelfit ################################################################# ###Pause here: We will discuss the next steps as a group. ###feel free to grab a coffee or take a minute if you finished early ################################################################### ############################################## # Multivariate GWAS in Genomic SEM # ############################################## #Note that Steps 1 (munge) and 2 (ldsc) mirror those from the first example #STEP 3: PREPARE SUMMARY STATISTICS FOR MULTIVARIATE GWAS #Takes four necessary arguments: #1. files = the name of the summary statistics file ##**note that these again are the drastically reduced subsets of SNPs for the practical ONLY files<-c("SCZ_subset.txt", "BIP_subset.txt", "MDD_subset.txt") #2. ref = the name of the reference file used to obtain SNP MAF #**note again that this is a drastically reduced subset of SNPs #the full reference set is available on our github ref="reference.1000G.subset.txt" #3. trait.names = the name of the files to be used in trait.names=c("SCZ","BIP","MDD") #4. se.logit = whether the standard errors are on an logistic scale se.logit<-c(T,T,T) #there are additional arguments (e.g., OLS, lpm) if you're traits are continuous #or binary analyzed using linear model. Please see github for more details #run the sumstats function below p_sumstats <- sumstats(files=files,ref=ref ,trait.names=trait.names,se.logit=se.logit) #STEP 4a: RUN THE MULTIVARIATE GWAS #commonfactorGWAS takes only two necessary arguments #1. covstruc = the output from the ldsc function covstruc<-PSYCH_COV #2. SNPs = output from sumstats function SNPs<-p_sumstats #3. estimation = optional third argument specifying estimation method to be used estimation<-"DWLS" #4. parallel = optional argument specifying whether it should be run in parallel #set to FALSE here just for the practical parallel<-FALSE #run the multivariate GWAS below pfactor_GWAS<-commonfactorGWAS(covstruc=covstruc, SNPs=SNPs, estimation = estimation,parallel=parallel) ##print the first five rows of the output pfactor_GWAS[1:5,] ##look at fail messages (0 = good to go) table(pfactor_GWAS$fail) #look at warning messages table(pfactor_GWAS$warning) #STEP 4b: RUN A USER SPECIFIED MULTIVARIATE GWAS #userGWAS takes three necessary arguments: #1. covstruc = the output from the ldsc function covstruc<-PSYCH_COV #2. SNPs = output from sumstats function SNPs<-p_sumstats #3. model = the model to be run #going to troubleshoot estimated ov variances are negative for 4 SNPs #by adding model constraint for all residuals to be above 0 model<-"F1=~SCZ+BIP+MDD F1~SNP SCZ~~a*SCZ BIP~~b*BIP MDD~~c*MDD a > .001 b > .001 c > .001" #4. estimation = optional argument specifying estimation method to be used estimation<-"DWLS" #5. sub = optional argument specifying component of model output to be saved sub<-"F1~SNP" #6. parallel = optional argument specifying whether it should be run in parallel #set to FALSE here just for the practical parallel<-FALSE #run the multivariate GWAS below pfactor_GWAS2<-userGWAS(covstruc=covstruc, SNPs=SNPs, model=model,estimation = estimation,sub=sub,parallel=parallel) ##print the first five rows of the userGWAS output pfactor_GWAS2[[1]][1:5,] ##see if we solved the negative observed variable (ov) variances problem table(pfactor_GWAS2[[1]]$warning) ################################################################# ###If you finish early feel free to go onto next section. #this is a larger set of traits that you can use to play around with #get comfortable with lavaan syntax and just get used to the code ################################################################### #load in the ldsc object for anthropometric traits load("Anthro_LDSC.RData") #look at the column names so you know how to specify the traits #in your model colnames(anthro$S) #1. covstruc = the output from the ldsc function covstruc<-anthro #2. model = the user specified model #WRITE YOUR OWN. anthro.model<-"" #estimation = an optional third argument specifying the estimation method to use estimation<-"DWLS" #std.lv = optional fourth argument specifying whether variances of latent variables should be set to 1 std.lv=FALSE #Run your model AnthroModel<- usermodel(covstruc=covstruc, model=anthro.model,estimation=estimation,std.lv=std.lv)