#load in the workshop materials from terminal
#https://qimr.az1.qualtrics.com/jfe/form/SV_72kzAByH72t9URo
#cp -r /faculty/andrew/GenomicSEM_2024/ ./

#STEP 0: load in GenomicSEM

#load in the devtools package to load R packages from github
#not necessary as Genomic SEM is already installed
#but included here for when you run this package on your own computer
#install.packages("devtools")
#library(devtools)

#install the package
#again already done for you for the workshop
#install_github("GenomicSEM/GenomicSEM")

#load in the package
require(GenomicSEM)

#set working directory (not necessary if creating "New Project" per the slides)
setwd("~/GenomicSEM_2024/")

#STEP 1: MUNGE THE FILES

#here we are using ALCH, PTSD, MDD, and ANX. Some preprocessing for these files is described here:
#https://github.com/GenomicSEM/GenomicSEM/wiki/2.1-Calculating-Sum-of-Effective-Sample-Size-and-Preparing-GWAS-Summary-Statistics

#munge takes four necessary arguments:

#1. files = the names of the summary statistics files
#files<-c("ALCH_withrsID.txt","SORTED_PTSD_EA9_ALL_study_specific_PCs1.txt", "MDD_withNeff.txt","ANX_withNeff.txt")

#2. hm3 = the name of the reference file used to align effects to the same reference allele across traits
#can be found on our github
#hm3<-"eur_w_ld_chr/w_hm3.snplist"

#3. trait.names = names used to create the .sumstats.gz output files
#trait.names<-c("ALCH", "PTSD", "MDD", "ANX")

#4. N = total sample size for traits
#N=c(NA,5831.346,NA,NA)

#Run the munge function. This creates four .sumstats.gz files (e.g., PTSD.sumstats.gz).
#munge(files=files, hm3=hm3, trait.names=trait.names, N=N)

#STEP 2: RUN LD-SCORE REGRESSION [also done for you]

#Takes five necessary arguments:

#1. traits = the names of the .sumstats.gz files
#traits<-c("ALCH.sumstats.gz", "PTSD.sumstats.gz", "MDD.sumstats.gz", "ANX.sumstats.gz")

#2. sample.prev = the proportion of cases to total sample size. For quantitative traits list NA
#here we use .5 because we are using the sum of effective sample size across cohorts,
#which is more appropriate for performing the ascertainment correction.
#sample.prev <- c(.5,.5,.5,.5)

#3. population.prev = the population lifetime prevalence of the traits. For quantitative traits list NA
#population.prev <-c(.159, .3, .15, .20)

#4. ld = folder of LD scores
#can be found on our github (https://github.com/GenomicSEM/GenomicSEM)
#ld <- "eur_w_ld_chr/"

#5. wld = folder of LD score weights [typically the same as the ld argument]
#wld <- "eur_w_ld_chr/"

#6. trait.names = optional sixth argument listing trait names so they can be named in your model
#trait.names<-c("ALCH", "PTSD", "MDD", "ANX")

#Run the ldsc function
#LDSC_INT<-ldsc(traits=traits, sample.prev=sample.prev, population.prev=population.prev, ld=ld, wld=wld, trait.names=trait.names)

#in many cases you will want to save the LDSC output for later use. Example code provided here
#save(LDSC_INT, file = "LDSC_INT.RData")

#STEP 3: SPECIFY AND RUN USER MODEL
#here we will run a common factor model of the internalizing traits

#load in the LDSC object made in step 2 above
load("LDSC_INT.RData")
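#Optional sanity check (a minimal sketch, not part of the required workshop steps):
#the ldsc() output is a list whose S element holds the genetic covariance matrix,
#with heritability estimates on the diagonal. Converting S to a correlation matrix
#with base R's cov2cor() gives the genetic correlations among the four traits,
#which is a useful look at the data before fitting a factor model.
diag(LDSC_INT$S)                #heritability estimates on the diagonal of S
round(cov2cor(LDSC_INT$S), 2)   #genetic correlation matrix across ALCH, PTSD, MDD, ANX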
#usermodel takes two necessary arguments:

#1. covstruc = the output from multivariable ldsc
covstruc<-LDSC_INT

#2. model = the user specified model
#here we specify a common factor model
INT.model<-"F1=~MDD+PTSD+ALCH+ANX"

#std.lv = optional argument specifying whether variances of latent variables should be set to 1
std.lv=TRUE

#Run your model
IntResults <- usermodel(covstruc=covstruc, model=INT.model, std.lv=std.lv)

#print the results of your model
IntResults$results

#print the model fit of your model
IntResults$modelfit

##############################################
#     Multivariate GWAS in Genomic SEM       #
##############################################

#STEP 3 within multivariate GWAS: PREPARE SUMMARY STATISTICS

#sumstats takes four necessary arguments along with other optional arguments that
#you may need for your use case. Here we need six arguments to run the function

#1. files = the names of the summary statistics files
#**note that these are drastically reduced subsets of SNPs for the practical ONLY,
#reflecting a selected set of chromosome 4 variants
#Also note that these need to be in the same order as your ldsc object
files<-c("ALCH4.txt", "PTSD4.txt", "MDD4.txt", "ANX4.txt")

#2. ref = the name of the reference file used to obtain SNP MAF
#**note again that this is a drastically reduced subset of SNPs for chromosome 4
#the full reference file is available on our github
ref <- "reference.1000G.ch4.txt"

#3. trait.names = the names of the traits to use for column labeling
trait.names<-c("ALCH", "PTSD", "MDD", "ANX")

#4. se.logit = whether the standard errors are on a logistic scale
se.logit<-c(F,T,T,T)

#5. linprob = whether a binary outcome was analyzed as continuous -or-
#the file contains only Z-statistics. This is TRUE for ALCH in our data
linprob<-c(T,F,F,F)

#6. N = sample size. This is only needed for continuous outcomes or outcomes where linprob is TRUE
#we do not provide a sample size for ALCH as it is already a column in the GWAS data
N<-c(NA,NA,NA,NA)

#run sumstats putting these pieces together
INT_sumstats<-sumstats(files=files, ref=ref, trait.names=trait.names, se.logit=se.logit, linprob=linprob, N=N)

#would typically save the output for future use
#write.csv(INT_sumstats, file = "INT4_sumstats.csv")

#STEP 4: RUN A USER SPECIFIED MULTIVARIATE GWAS

#userGWAS takes three necessary arguments:

#1. covstruc = the output from the ldsc function
covstruc<-LDSC_INT

#2. SNPs = output from the sumstats function
SNPs<-INT_sumstats

#3. model = the model to be run
#note: estimated observed-variable (residual) variances are negative for 4 SNPs;
#this can be troubleshot by adding a model constraint forcing all residual variances to be above 0
model<-"F1=~MDD+PTSD+ALCH+ANX
F1~SNP"

#4. sub = optional argument specifying the component of model output to be saved
sub<-"F1~SNP"

#5. parallel = optional argument specifying whether it should be run in parallel
#set to FALSE here just for the practical
parallel<-FALSE

#6. Q_SNP = optional argument specifying whether you want
#the heterogeneity index calculated for your factors
Q_SNP<-TRUE

#run the multivariate GWAS below
INT_GWAS<-userGWAS(covstruc=covstruc, SNPs=SNPs, model=model, sub=sub, parallel=parallel, Q_SNP=Q_SNP)

#print the first five rows of the userGWAS output
INT_GWAS[[1]][1:5,]

#examine warnings (0 = good to go)
table(INT_GWAS[[1]]$warning)

#examine errors (0 = good to go)
table(INT_GWAS[[1]]$error)

#any genome-wide significant factor hits?
table(INT_GWAS[[1]]$Pval_Estimate < 5e-8)

#what about genome-wide significant Q_SNP hits?
table(INT_GWAS[[1]]$Q_SNP_pval < 5e-8)

#trending significant Q_SNP hits? What are these hits (what gene region are they in?)
table(INT_GWAS[[1]]$Q_SNP_pval < 1e-5)

#look at the results
View(INT_GWAS[[1]])
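#Optional follow-up (a sketch only, not part of the required workshop steps):
#a common next step after a factor GWAS is to approximate the expected sample size
#of the latent factor so its summary statistics can be used in downstream tools
#(e.g., LDSC). This sketch assumes the userGWAS output retains the MAF and SE columns
#and that std.lv=TRUE, so SNP effects are on the unit-variance factor scale; it uses
#the standard approximation N ~ 1/(2*MAF*(1-MAF)*SE^2), averaged over reasonably
#common SNPs (MAF between .10 and .40).
#F1_GWAS<-INT_GWAS[[1]]
#F1_common<-subset(F1_GWAS, F1_GWAS$MAF >= .1 & F1_GWAS$MAF <= .4)
#N_hat_F1<-mean(1/((2*F1_common$MAF*(1-F1_common$MAF))*F1_common$SE^2))
#N_hat_F1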
#################################################################
#If you finish early, go on to the next section.
#This is a larger set of traits that you can use to play around with
#and get more practice with lavaan syntax and interpreting results
#################################################################

#load in a premade ldsc object for a set of anthropometric traits
load("Anthro_LDSC.RData")

#1. covstruc = the output from the ldsc function
covstruc<-anthro

#look at the column names so you know how to specify the traits
#in your model for the second argument below
#note that you do not need to use all of the traits
colnames(anthro$S)

#2. model = the user specified model
#WRITE YOUR OWN.
Your.Model<-""

#std.lv = optional argument specifying whether variances of latent variables should be set to 1
std.lv=TRUE

#Run your model
YourResults <- usermodel(covstruc=covstruc, model=Your.Model, std.lv=std.lv)

#look at your results; what do they tell you?
YourResults$results

#does it fit the data well?
YourResults$modelfit
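#Optional extra (a sketch only): if you want a quick baseline before writing your
#own lavaan syntax, Genomic SEM also provides commonfactor(), which fits a single
#common factor to every trait in the covariance structure. The estimation="DWLS"
#argument shown here is the diagonally weighted least squares estimator used
#throughout the practical; check ?commonfactor for the exact arguments in your
#installed version.
#CommonAnthro<-commonfactor(covstruc=anthro, estimation="DWLS")
#CommonAnthro$results
#CommonAnthro$modelfit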