#load in the workshop materials from terminal
#https://qimr.az1.qualtrics.com/jfe/form/SV_72kzAByH72t9URo
#cp -r /faculty/andrew/GenomicSEM_2024/ ./

#STEP 0: load in GenomicSEM

#load in the devtools package to load R packages from github
#not necessary as Genomic SEM is already installed
#but included here for when you run this package on your own computer
#install.packages("devtools")
#library(devtools)

#install the package
#again already done for you for the workshop
#install_github("GenomicSEM/GenomicSEM")

#load in the package
require(GenomicSEM)

#set working directory (not necessary if creating "New Project" per the slides)
setwd("~/GenomicSEM_2024/")

#STEP 1: MUNGE THE FILES

#here we are using ALCH, PTSD, MDD, and ANX. Some preprocessing for these files is described here:
#https://github.com/GenomicSEM/GenomicSEM/wiki/2.1-Calculating-Sum-of-Effective-Sample-Size-and-Preparing-GWAS-Summary-Statistics

#munge takes four necessary arguments:

#1. files = the names of the summary statistics files
#files<-c("ALCH_withrsID.txt","SORTED_PTSD_EA9_ALL_study_specific_PCs1.txt", "MDD_withNeff.txt","ANX_withNeff.txt")

#2. hm3 = the name of the reference file used to align effects to the same reference allele across traits
#can be found on our github
#hm3<-"eur_w_ld_chr/w_hm3.snplist"

#3. trait.names = names used to create the .sumstats.gz output files
#trait.names<-c("ALCH", "PTSD", "MDD", "ANX")

#4. N = total sample size for traits
#N=c(NA,5831.346,NA,NA)

#Run the munge function. This creates four .sumstats.gz files (e.g., PTSD.sumstats.gz).
#munge(files=files, hm3=hm3, trait.names=trait.names, N=N)

#STEP 2: RUN LD-SCORE REGRESSION [also done for you]

#Takes five necessary arguments:

#1. traits = the names of the .sumstats.gz files
#traits<-c("ALCH.sumstats.gz", "PTSD.sumstats.gz", "MDD.sumstats.gz", "ANX.sumstats.gz")

#2. sample.prev = the proportion of cases to total sample size. For quantitative traits list NA
#here we use .5 because we are using the sum of effective sample size across cohorts,
#which is more appropriate for performing the ascertainment correction.
#sample.prev <- c(.5,.5,.5,.5)

#3. population.prev = the population lifetime prevalence of the traits. For quantitative traits list NA
#population.prev <-c(.159, .3, .15, .20)

#4. ld = folder of LD scores
#can be found on our github (https://github.com/GenomicSEM/GenomicSEM)
#ld <- "eur_w_ld_chr/"

#5. wld = folder of LD score weights [typically the same as the ld argument]
#wld <- "eur_w_ld_chr/"

#6. trait.names = optional sixth argument listing trait names so they can be named in your model
#trait.names<-c("ALCH", "PTSD", "MDD", "ANX")

#Run the ldsc function
#LDSC_INT<-ldsc(traits=traits, sample.prev=sample.prev, population.prev=population.prev, ld=ld, wld=wld, trait.names=trait.names)

#in many cases you will want to save the LDSC output for later use. Example code provided here
#save(LDSC_INT, file = "LDSC_INT.RData")

#STEP 3: SPECIFY AND RUN USER MODEL
#here we will run a common factor model of the internalizing traits

#load in the LDSC object made in step 2 above
load("LDSC_INT.RData")
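#Optional sanity check (a minimal sketch, not part of the required workshop steps):
#the ldsc() output is a list whose S element holds the genetic covariance matrix,
#with heritability estimates on the diagonal. Converting S to a correlation matrix
#with base R's cov2cor() gives the genetic correlations among the four traits,
#which is a useful look at the data before fitting a factor model.
diag(LDSC_INT$S)                #heritability estimates on the diagonal of S
round(cov2cor(LDSC_INT$S), 2)   #genetic correlation matrix across ALCH, PTSD, MDD, ANX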
#usermodel takes two necessary arguments:

#1. covstruc = the output from multivariable ldsc
covstruc<-LDSC_INT

#2. model = the user specified model
#here we specify a common factor model
INT.model<-"F1=~MDD+PTSD+ALCH+ANX"

#std.lv = optional argument specifying whether variances of latent variables should be set to 1
std.lv=TRUE

#Run your model
IntResults <- usermodel(covstruc=covstruc, model=INT.model, std.lv=std.lv)

#print the results of your model
IntResults$results

#print the model fit of your model
IntResults$modelfit

##############################################
#     Multivariate GWAS in Genomic SEM       #
##############################################

#STEP 3 within multivariate GWAS: PREPARE SUMMARY STATISTICS

#sumstats takes four necessary arguments along with other optional arguments that
#you may need for your use case. Here we need six arguments to run the function

#1. files = the names of the summary statistics files
#**note that these are drastically reduced subsets of SNPs for the practical ONLY,
#reflecting a selected set of chromosome 4 variants
#Also note that these need to be in the same order as your ldsc object
files<-c("ALCH4.txt", "PTSD4.txt", "MDD4.txt", "ANX4.txt")

#2. ref = the name of the reference file used to obtain SNP MAF
#**note again that this is a drastically reduced subset of SNPs for chromosome 4
#the full reference file is available on our github
ref <- "reference.1000G.ch4.txt"

#3. trait.names = the names of the traits to use for column labeling
trait.names<-c("ALCH", "PTSD", "MDD", "ANX")

#4. se.logit = whether the standard errors are on a logistic scale
se.logit<-c(F,T,T,T)

#5. linprob = whether a binary outcome was analyzed as continuous -or-
#the file contains only Z-statistics. This is TRUE for ALCH in our data
linprob<-c(T,F,F,F)

#6. N = sample size. This is only needed for continuous outcomes or outcomes where linprob is TRUE
#we do not provide a sample size for ALCH as it is already a column in the GWAS data
N<-c(NA,NA,NA,NA)

#run sumstats putting these pieces together
INT_sumstats<-sumstats(files=files, ref=ref, trait.names=trait.names, se.logit=se.logit, linprob=linprob, N=N)

#would typically save the output for future use
#write.csv(INT_sumstats, file = "INT4_sumstats.csv")

#STEP 4: RUN A USER SPECIFIED MULTIVARIATE GWAS

#userGWAS takes three necessary arguments:

#1. covstruc = the output from the ldsc function
covstruc<-LDSC_INT

#2. SNPs = output from the sumstats function
SNPs<-INT_sumstats

#3. model = the model to be run
#note: estimated observed-variable (residual) variances are negative for 4 SNPs;
#this can be troubleshot by adding a model constraint forcing all residual variances to be above 0
model<-"F1=~MDD+PTSD+ALCH+ANX
F1~SNP"

#4. sub = optional argument specifying the component of model output to be saved
sub<-"F1~SNP"

#5. parallel = optional argument specifying whether it should be run in parallel
#set to FALSE here just for the practical
parallel<-FALSE

#6. Q_SNP = optional argument specifying whether you want
#the heterogeneity index calculated for your factors
Q_SNP<-TRUE

#run the multivariate GWAS below
INT_GWAS<-userGWAS(covstruc=covstruc, SNPs=SNPs, model=model, sub=sub, parallel=parallel, Q_SNP=Q_SNP)

#print the first five rows of the userGWAS output
INT_GWAS[[1]][1:5,]

#examine warnings (0 = good to go)
table(INT_GWAS[[1]]$warning)

#examine errors (0 = good to go)
table(INT_GWAS[[1]]$error)

#any genome-wide significant factor hits?
table(INT_GWAS[[1]]$Pval_Estimate < 5e-8)

#what about genome-wide significant Q_SNP hits?
table(INT_GWAS[[1]]$Q_SNP_pval < 5e-8)

#trending significant Q_SNP hits? What are these hits (what gene region are they in?)
table(INT_GWAS[[1]]$Q_SNP_pval < 1e-5)

#look at the results
View(INT_GWAS[[1]])
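#Optional follow-up (a sketch only, not part of the required workshop steps):
#a common next step after a factor GWAS is to approximate the expected sample size
#of the latent factor so its summary statistics can be used in downstream tools
#(e.g., LDSC). This sketch assumes the userGWAS output retains the MAF and SE columns
#and that std.lv=TRUE, so SNP effects are on the unit-variance factor scale; it uses
#the standard approximation N ~ 1/(2*MAF*(1-MAF)*SE^2), averaged over reasonably
#common SNPs (MAF between .10 and .40).
#F1_GWAS<-INT_GWAS[[1]]
#F1_common<-subset(F1_GWAS, F1_GWAS$MAF >= .1 & F1_GWAS$MAF <= .4)
#N_hat_F1<-mean(1/((2*F1_common$MAF*(1-F1_common$MAF))*F1_common$SE^2))
#N_hat_F1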
#################################################################
#If you finish early, go on to the next section.
#This is a larger set of traits that you can use to play around with
#and get more practice with lavaan syntax and interpreting results
#################################################################

#load in a premade ldsc object for a set of anthropometric traits
load("Anthro_LDSC.RData")

#1. covstruc = the output from the ldsc function
covstruc<-anthro

#look at the column names so you know how to specify the traits
#in your model for the second argument below
#note that you do not need to use all of the traits
colnames(anthro$S)

#2. model = the user specified model
#WRITE YOUR OWN.
Your.Model<-""

#std.lv = optional argument specifying whether variances of latent variables should be set to 1
std.lv=TRUE

#Run your model
YourResults <- usermodel(covstruc=covstruc, model=Your.Model, std.lv=std.lv)

#look at your results; what do they tell you?
YourResults$results

#does it fit the data well?
YourResults$modelfit
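#Optional extra (a sketch only): if you want a quick baseline before writing your
#own lavaan syntax, Genomic SEM also provides commonfactor(), which fits a single
#common factor to every trait in the covariance structure. The estimation="DWLS"
#argument shown here is the diagonally weighted least squares estimator used
#throughout the practical; check ?commonfactor for the exact arguments in your
#installed version.
#CommonAnthro<-commonfactor(covstruc=anthro, estimation="DWLS")
#CommonAnthro$results
#CommonAnthro$modelfit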