#!/usr/bin/Rscript ############################################################################################ ### 3: Making phenotypes from a simple polygenic model for the N individuals from M loci ### This is the fundamental model: phenotype = genotype + environment, or y=g+e N<-2000 ### Number of individuals M<-500 ### Number of loci to create genotypes for af<-runif(M,0.01,0.4) ### Allele frequencies of a population. ggen<-function(rep){rbinom(M,2,af)} ### Function to make genotypes for M loci from the allele frequencies gt<-sapply(rep(0,N),ggen) ### Creating the N individuals' genotypes af_samp<-apply(gt,1,sum)/(N*2) ### The allele frequencies in the SAMPLE ### first make allelic effects: 1XM matrix b_scale<-matrix(rnorm(M,0,1/sqrt(2*af*(1-af))),1,M) #Effect size scaled to allele frequency. Why use afs rather than af_samp? b<-rnorm(M,0,1) ### Or use a standard normal for the allelic effects par(mfrow=c(3,2),mar=c(3.5,3.5,1,1),mgp=c(2,1,0)) hist(af);plot(af,af_samp);hist(b);plot(af,b);hist(b_scale);plot(af,b_scale) ### Simulating the phenotypes gi<-as.vector(b_scale%*%gt) ### Making the individuals' genotypic values Vg<-var(gi) par(mfrow=c(3,1),mar=c(3.5,3.5,1,1),mgp=c(2,1,0)) hist(gi,breaks=50) ### Add in error ei<-rnorm(N,0,sqrt(Vg)) #Note this should have h2=0.5. Could vary this as well? Make a difference? Ve <-var(ei) hist(ei) ### Make phenotype: p <- gi + ei hist(p) Vp<-var(as.vector(p)) ############################################################################################ ### 3b: Making new effect sizes for the same loci you randomly chose before, using the MAF-effect size relationship and then GCTA to actually simulate m<-read.table("simulation_2/pheno.common.rep1.txt.par",header=T) head(m) b<-rnorm(nrow(m),0,1/sqrt(2*m$Frequency*(1-m$Frequency))) ### Making new effect sizes plot(m$Frequency,b) ### What is the relationship between MAF & effect size? new_b<-cbind(m$QTL,b) ### Our new list of causal loci with the MAF-scaled effect sizes write.table(new_b,"simulation_3/common.rep1.MAFscaled.txt",row.names=F,col.names=F,sep="\t",quote=F) #this is the new phenotype, which you can run and compare to your old simulations (without MAF-scaled effect sizes) ### How would you run this across each of your randomly-generated sets of causal loci? ### Now complete making the phenotypes & running some example analyses with the steps in simulation_3.bash ############################################################################################ ### 3c: Plotting h2 estimates (do *****AFTER****** you've run the simulation_3.bash exercise) h2<-read.table("simulation_3/previously_completed/all_hsq_estimates.txt",header=T) h2n<-h2[which(h2$betas=="normal"),] boxplot(h2n$h2~h2n$CV_MAF,main="",ylim=c(0,1),ylab="SNP-heritability") ### How would you compare all the simulations? What are the tests to run? ##### EXTRA THINGS TO THINK ABOUT ############################################################################################ ### 3d. Adjusting heritability: altering the impact of genetic and environmental variance ### Could normalize first, which makes the math a bit easier to think about heritability: ### Ve = Vg/h2 - Vg or (1-h2)*Vg/h2 h2<-.8 ### Narrow-sense heritability to simulate ngi<-(gi-mean(gi))/sd(gi) ngi_scale<-scale(gi) #Built-in function to normalize the data plot(ngi,ngi_scale) Vgn<-var(ngi) print(Vgn) nei<-rnorm(N,0,sqrt((1-h2)*Vgn/h2)) Ven<-var(nei) pn<-ngi+nei hist(pn) var(pn)==Vgn+Ven+2*cov(ngi,nei) h2_hat<-Vgn/(var(pn)) #### The heritability from the simulated sample. Why doesn't this equal h2=0.8 exactly? h2_hat ############################################################################################ ###### 3e. Very simple association analysis for binary trait - vary the betas, MAFs, etc. & see what it does k<-.3 ### Population prevalence of a disease bp<-rep(0,N) bp[which(pn>quantile(pn,1-k))]<-1 ### Assuming that the sample is a random sample wrt disease sum(bp)/N library(foreach) # setting up environment to use multiple cores if possible library(doMC) registerDoMC(cores=4) fitted<-foreach(nn=1:M,.combine=rbind)%dopar%{ Atmp<-N*cor(bp,gt[nn,])^2 #### Armitage trend test c(Atmp,1-pchisq(Atmp,1)) } ### Otherwise: fitted<-NULL for(nn in 1:M){ Atmp<-N*cor(bp,gt[nn,])^2 #### Armitage trend test fitted<-rbind(fitted,c(Atmp,1-pchisq(Atmp,1))) } par(mfrow=c(3,1),mar=c(3.25,3.25,1,1),mgp=c(2,1,0)) plot(fitted[,1],ylab="association statistic (X2)",xlab="Locus index") plot(-log10(fitted[,2]),ylab = "-log10 (p-value)",xlab="Locus index") plot(b,-log10(fitted[,2]),ylab = "-log10 (p-value)",xlab="effect size b")