#load in the workshop materials from terminal

#cp -r /faculty/andrew/GenomicSEM_practical/ ./

#STEP 0: load in GenomicSEM
#load in the devtools package to load R packages from github
#not necessary as Genomic SEM is already installed
#but included here for when you run this package on your own computer
#install.packages("devtools")
#library(devtools)
  
#install the package
#again already done for you for the workshop
#install_github("GenomicSEM/GenomicSEM")
  
#load in the package
require(GenomicSEM)

#if unable to access materials on server all of these materials are also available at:
#https://utexas.box.com/s/sounavy84gwygj0j2askcyaoo2ostbgu

#set working directory (not necessary if creating "New Project" per the slides)
#setwd("~/GenomicSEM_practical")

##############################################
#	Run Common Factor Model						 #
##############################################

#STEP 1: MUNGE THE FILES 
##PLEASE NOTE: We are using a subset of 1k SNPs for the munge and ldsc functions
#in practice you will NOT use this subset but will just feed in the full set of summary 
#statistics you download online. This is just for the purposes of the tutorial and you getting a 
#sense of what to look for. We will load in a premade ldsc object using a full set of SNPs
#before moving on to Step 3

#Munge takes four necessary arguments:

#1. files = the name of the summary statistics files
files<-c("SCZ_subset.txt", "BIP_subset.txt", "MDD_subset.txt")

#2. hm3 = the name of the reference file to use for alligning effects to same ref allele across traits
#can be found on our github
hm3<-"eur_w_ld_chr/w_hm3.snplist"

#3. trait.names = names used to create the .sumstats.gz output files
trait.names<-c("SCZ","BIP","MDD")

#4. N = total sample size for traits
N<-c(105318, 16731, 173005)

#Run the munge function. This will create three .sumstats.gz files (e.g., SCZ.sumstats.gz).
munge(files=files,hm3=hm3, trait.names=trait.names, N=N)


#STEP 2: RUN LD-SCORE REGRESSION 
#Takes five necessary arguments:
#1. traits = the name of the .sumstats.gz traits
traits<- c("SCZ.sumstats.gz", "BIP.sumstats.gz", "MDD.sumstats.gz")

#2. sample.prev = the proportion of cases to total sample size. For quantitative traits list NA
sample.prev <- c(.39,.45,.35)

#3. population.prev = the population lifetime prevalence of the traits. For quantitative traits list NA
population.prev <- c(.01,.01,.16)

##4. ld = folder of LD scores 
#can be found on our github (https://github.com/GenomicSEM/GenomicSEM)
ld <- "eur_w_ld_chr/"

#5. wld = folder of LD scores
wld <- "eur_w_ld_chr/"

#6. trait.names = optional sixth  argument to list trait names so they can be named in your model
trait.names<-c("SCZ", "BIP", "MDD")

#Run the ldsc function 
COV_EXAMPLE<-ldsc(traits=traits, sample.prev=sample.prev, population.prev=population.prev, ld=ld, wld=wld,trait.names=trait.names)

##in many cases may want to save the data for later use. Example code provided here
#save(COV_EXAMPLE, file="COV_EXAMPLE.RData")

#STEP 3: ESTIMATE THE COMMON FACTOR MODEL
##going to first load in premade LDSC objects
load("GenomicSEMPractical.RData")

#requires only one necessary argument:
#1. covstruc = the output from the ldsc function
covstruc<-PSYCH_COV

#an optional second argument can be provided for the estimation method
estimation<-"DWLS"

#run the commonfactor model below
PFactor <- commonfactor(covstruc=covstruc,estimation=estimation)

#Print PFactor results
PFactor$results


#################################################################
###Pause here: We will discuss the next steps as a group.
###feel free to grab a coffee or take a minute if you finished early
###################################################################


##############################################
#	Run User Specified Model: Part 1				 #
##############################################

#STEP 3: SPECIFY AND RUN USER MODEL
#Takes two necessary arguments:
#1. covstruc = the output from multivariable ldsc 
#in this example = ldsc results for Schizophrenia, Bipolar, MDD, EA, and Insomnia
covstruc<-PRAC_COV

#2. model = the user specified model
##delete to write your own!!
MY.model<-"F1=~NA*SCZ+BIP+MDD
F1~~1*F1
INSOM~F1
EA~INSOM"

#estimation = an optional third argument specifying the estimation method to use
estimation<-"DWLS"

#std.lv = optional fourth argument specifying whether variances of latent variables should be set to 1
std.lv=FALSE

#Run your model
YourModel<- usermodel(covstruc=covstruc, model=MY.model,estimation=estimation,std.lv=std.lv)

#print the results of your model
YourModel$results

#print the model fit of your model
YourModel$modelfit


#################################################################
###Pause here: We will discuss the next steps as a group.
###feel free to grab a coffee or take a minute if you finished early
###################################################################

##############################################
#     Multivariate GWAS in Genomic SEM       #
##############################################

#Note that Steps 1 (munge) and 2 (ldsc) mirror those from the first example

#STEP 3: PREPARE SUMMARY STATISTICS FOR MULTIVARIATE GWAS
#Takes four necessary arguments:
#1. files = the name of the summary statistics file
##**note that these again are the drastically reduced subsets of SNPs for the practical ONLY
files<-c("SCZ_subset.txt", "BIP_subset.txt", "MDD_subset.txt")

#2. ref = the name of the reference file used to obtain SNP MAF
#**note again that this is a drastically reduced subset of SNPs
#the full reference set is available on our github
ref="reference.1000G.subset.txt"

#3. trait.names = the name of the files to be used in 
trait.names=c("SCZ","BIP","MDD")

#4. se.logit = whether the standard errors are on an logistic scale
se.logit<-c(T,T,T)

#there are additional arguments (e.g., OLS, lpm) if you're traits are continuous
#or binary analyzed using linear model. Please see github for more details

#run the sumstats function below
p_sumstats <- sumstats(files=files,ref=ref ,trait.names=trait.names,se.logit=se.logit)


#STEP 4a: RUN THE MULTIVARIATE GWAS
#commonfactorGWAS takes only two necessary arguments
#1. covstruc = the output from the ldsc function
covstruc<-PSYCH_COV

#2. SNPs = output from sumstats function
SNPs<-p_sumstats

#3. estimation = optional third argument specifying estimation method to be used
estimation<-"DWLS"

#4. parallel = optional argument specifying whether it should be run in parallel
#set to FALSE here just for the practical
parallel<-FALSE

#run the multivariate GWAS below
pfactor_GWAS<-commonfactorGWAS(covstruc=covstruc, SNPs=SNPs, estimation = estimation,parallel=parallel)

##print the first five rows of the output
pfactor_GWAS[1:5,]

##look at fail messages (0 = good to go)
table(pfactor_GWAS$fail)

#look at warning messages
table(pfactor_GWAS$warning)


#STEP 4b: RUN A USER SPECIFIED MULTIVARIATE GWAS
#userGWAS takes three necessary arguments:
#1. covstruc = the output from the ldsc function
covstruc<-PSYCH_COV

#2. SNPs = output from sumstats function
SNPs<-p_sumstats

#3. model = the model to be run
#going to troubleshoot estimated ov variances are negative for 4 SNPs 
#by adding model constraint for all residuals to be above 0
model<-"F1=~SCZ+BIP+MDD
F1~SNP
SCZ~~a*SCZ
BIP~~b*BIP
MDD~~c*MDD
a > .001
b > .001
c > .001"

#4. estimation = optional argument specifying estimation method to be used
estimation<-"DWLS"

#5. sub = optional argument specifying component of model output to be saved
sub<-"F1~SNP"

#6. parallel = optional argument specifying whether it should be run in parallel
#set to FALSE here just for the practical
parallel<-FALSE

#run the multivariate GWAS below
pfactor_GWAS2<-userGWAS(covstruc=covstruc, SNPs=SNPs, model=model,estimation = estimation,sub=sub,parallel=parallel)

##print the first five rows of the userGWAS output
pfactor_GWAS2[[1]][1:5,]

##see if we solved the negative observed variable (ov) variances problem
table(pfactor_GWAS2[[1]]$warning)


#################################################################
###If you finish early feel free to go onto next section.
#this is a larger set of traits that you can use to play around with 
#get comfortable with lavaan syntax and just get used to the code
###################################################################

#load in the ldsc object for anthropometric traits
load("Anthro_LDSC.RData")

#look at the column names so you know how to specify the traits
#in your model
colnames(anthro$S)

#1. covstruc = the output from the ldsc function
covstruc<-anthro

#2. model = the user specified model
#WRITE YOUR OWN. 
anthro.model<-""

#estimation = an optional third argument specifying the estimation method to use
estimation<-"DWLS"

#std.lv = optional fourth argument specifying whether variances of latent variables should be set to 1
std.lv=FALSE

#Run your model
AnthroModel<- usermodel(covstruc=covstruc, model=anthro.model,estimation=estimation,std.lv=std.lv)