##############################################
#	Run Common Factor Model						 #
##############################################

#STEP 0: load in GenomicSEM
#load in the devtools package to load R packages from github
#install.packages("devtools")
#library(devtools)

#install the package
#install_github("MichelNivard/GenomicSEM")

#load in the package
require(GenomicSEM)
getwd()
#load in the example ldsc objects that we will use for the practical
setwd("/home/andrew/GenomicSEM_BoulderWorkshop2020/")

load("GenomicSEMPractical.RData")

#STEP 1: MUNGE THE FILES (done for you already)
#Takes four necessary arguments:

#1. files = the name of the summary statistics files
#setwd("~/Downloads/MichiganWorkshop_GenomicSEMFiles/")
#files<-c("SCZ_HM3.txt", "BIP_HM3.txt", "MDD_HM3.txt")

#2. hm3 = the name of the reference file to use for alligning effects to same ref allele across traits
#can be found on our github
#hm3<-"w_hm3.noMHC.snplist"

#3. trait.names = names used to create the .sumstats.gz output files
#trait.names<-c("SCZ_HM3","BIP_HM3","MDD_HM3")

#4. N = total sample size for traits
#N<-c(105318, 16731, 173005)

#Run the munge function. This will create three .sumstats.gz files (e.g., SCZ_HM3.sumstats.gz).
#munge(files=files,hm3=hm3, trait.names=trait.names, N=N)

#STEP 2: RUN LD-SCORE REGRESSION (also done)
#Takes five necessary arguments:
#1. traits = the name of the .sumstats.gz traits
#traits<- c("SCZ_HM3.sumstats.gz", "BIP_HM3.sumstats.gz", "MDD_HM3.sumstats.gz")

#2. sample.prev = the proportion of cases to total sample size. For quantitative traits list NA
#sample.prev <- c(.39,.45,.35)

#3. population.prev = the population lifetime prevalence of the traits. For quantitative traits list NA
#population.prev <- c(.01,.01,.16)

##4. ld = folder of LD scores 
#can be found on our github
#ld <- "eur_w_ld_chr/"

#5. wld = folder of LD scores
#can be found on our github
#wld <- "eur_w_ld_chr/"

#6. trait.names = optional sixth  argument to list trait names so they can be named in your model
#trait.names<-c("SCZ", "BIP", "MDD")

#Run the ldsc function 
#PSYCH_COV<-ldsc(traits=traits, sample.prev=sample.prev, population.prev=population.prev, ld=ld, wld=wld,trait.names=trait.names)

##in many cases may want to save the data for later use. Example code provided here
#save(PSYCH_COV, file="PSYCH.RData")

#STEP 3: ESTIMATE THE COMMON FACTOR MODEL
#requires only one necessary argument:
#1. covstruc = the output from the ldsc function
covstruc<-PSYCH_COV

#an optional second argument can be provided for the estimation method
estimation<-"DWLS"

#run the commonfactor model below
PFactor <- commonfactor(covstruc=covstruc,estimation=estimation)

#Print PFactor results
PFactor$results


##############################################
#	Run User Specified Model: Part 1				 #
##############################################

#STEP 3: SPECIFY AND RUN USER MODEL
#Takes two necessary arguments:
#1. covstruc = the output from multivariable ldsc 
#in this example = ldsc results for Schizophrenia, Bipolar, MDD, EA, and Insomnia
covstruc<-PRAC_COV

#2. model = the user specified model
##delete to write your own!!
MY.model<-"F1=~NA*SCZ+BIP+MDD
F1~~1*F1
INSOM~F1
EA~INSOM"

#estimation = an optional third argument specifying the estimation method to use
estimation<-"DWLS"

#std.lv = optional fourth argument specifying whether variances of latent variables should be set to 1
std.lv=FALSE

#Run your model
YourModel<- usermodel(covstruc=covstruc, model=MY.model,estimation=estimation,std.lv=std.lv)

#print the results of your model
YourModel$results

#print the model fit of your model
YourModel$modelfit


##############################################
#     Multivariate GWAS in Genomic SEM       #
##############################################

#Note that Steps 1 and 2 mirror those from the first example

#STEP 3: PREPARE SUMMARY STATISTICS FOR MULTIVARIATE GWAS
#Takes four necessary arguments:
#1. files = the name of the summary statistics file
##**note that these are drastically reduced subsets of SNPs for the practical only
files<-c("SCZ_100.txt", "BIP_100.txt", "MDD_100.txt")

#2. ref = the name of the reference file used to obtain SNP MAF
#**note again that this is a drastically reduced subset of SNPs
ref="reference.1000G.subset.txt"

#3. trait.names = the name of the files to be used in 
trait.names=c("SCZ","BIP","MDD")

#4. se.logit = whether the standard errors are on an logistic scale
se.logit<-c(T,T,T)

#run the sumstats function below
p_sumstats <- sumstats(files=files,ref=ref ,trait.names=trait.names,se.logit=se.logit)

#STEP 4a: RUN THE MULTIVARIATE GWAS
#commonfactorGWAS takes only two necessary arguments
#1. covstruc = the output from the ldsc function
covstruc<-PSYCH_COV

#2. SNPs = output from sumstats function
SNPs<-p_sumstats

#3. estimation = optional third argument specifying estimation method to be used
estimation<-"DWLS"

#4. parallel = optional argument specifying whether it should be run in parallel
#set to FALSE here just for the practical
parallel<-FALSE

#5. SNPSE = optional argument specifying value of standard error for SNP
SNPSE<-.005

#run the multivariate GWAS below
pfactor_GWAS<-commonfactorGWAS(covstruc=covstruc, SNPs=SNPs, estimation = estimation,parallel=parallel,SNPSE=SNPSE)

##print the first five rows of the output
pfactor_GWAS[1:5,]

##look at fail messages (0 = good to go)
table(pfactor_GWAS$fail)

#look at warning messages
table(pfactor_GWAS$warning)


#STEP 4b: RUN A USER SPECIFIED MULTIVARIATE GWAS
#userGWAS takes three necessary arguments:
#1. covstruc = the output from the ldsc function
covstruc<-PSYCH_COV

#2. SNPs = output from sumstats function
SNPs<-p_sumstats

#3. model = the model to be run
#going to troubleshoot estimated ov variances are negative 
#by adding model constraint for all residuals to be above 0
model<-"F1=~SCZ+BIP+MDD
F1~SNP
SCZ~~a*SCZ
BIP~~b*BIP
MDD~~c*MDD
a > .001
b > .001
c > .001"

#4. modelchi = optional argument whether you want model chi-square for the individual model
#default = FALSE
modelchi<-FALSE

#5. estimation = optional argument specifying estimation method to be used
estimation<-"DWLS"

#6. sub = optional argument specifying component of model output to be saved
sub<-"F1~SNP"

#7. SNPSE = optional argument specifying value of standard error for SNP
SNPSE<-.005

#8. parallel = optional argument specifying whether it should be run in parallel
#set to FALSE here just for the practical
parallel<-FALSE

#run the multivariate GWAS below
pfactor_GWAS2<-userGWAS(covstruc=covstruc, SNPs=SNPs, model=model,modelchi=modelchi,
                        estimation = estimation,sub=sub,SNPSE=SNPSE,parallel=parallel)

##print the first five rows of the userGWAS output
pfactor_GWAS2[[1]][1:5,]

##see if we solved the warning problem
table(pfactor_GWAS2[[1]]$warning)


###IF theres time
load("Anthro_LDSC.RData")
colnames(anthro$S)

covstruc<-anthro

#2. model = the user specified model
anthro.model<-""

#estimation = an optional third argument specifying the estimation method to use
estimation<-"DWLS"

#std.lv = optional fourth argument specifying whether variances of latent variables should be set to 1
std.lv=FALSE

#Run your model
AnthroModel<- usermodel(covstruc=covstruc, model=anthro.model,estimation=estimation,std.lv=std.lv)