#!/usr/bin/Rscript

############################################################################################
### 3: Making phenotypes from a simple polygenic model for the N individuals from M loci
### This is the fundamental model: phenotype = genotype + environment, or y=g+e
N<-2000 ### Number of individuals
M<-500 ### Number of loci to create genotypes for
af<-runif(M,0.01,0.4) ### Allele frequencies of a population.
ggen<-function(rep){rbinom(M,2,af)} ### Function to make genotypes for M loci from the allele frequencies
gt<-sapply(rep(0,N),ggen) ### Creating the N individuals' genotypes
af_samp<-apply(gt,1,sum)/(N*2) ### The allele frequencies in the SAMPLE

### first make allelic effects: 1XM matrix
b_scale<-matrix(rnorm(M,0,1/sqrt(2*af*(1-af))),1,M)  #Effect size scaled to allele frequency. Why use afs rather than af_samp?
b<-rnorm(M,0,1) ### Or use a standard normal for the allelic effects
par(mfrow=c(3,2),mar=c(3.5,3.5,1,1),mgp=c(2,1,0))
hist(af);plot(af,af_samp);hist(b);plot(af,b);hist(b_scale);plot(af,b_scale)

### Simulating the phenotypes
gi<-as.vector(b_scale%*%gt) ### Making the individuals' genotypic values
Vg<-var(gi)
par(mfrow=c(3,1),mar=c(3.5,3.5,1,1),mgp=c(2,1,0))
hist(gi,breaks=50)

### Add in error
ei<-rnorm(N,0,sqrt(Vg)) #Note this should have h2=0.5. Could vary this as well? Make a difference?
Ve <-var(ei)
hist(ei)

### Make phenotype:
p <- gi + ei
hist(p)
Vp<-var(as.vector(p))


############################################################################################
### 3b: Making new effect sizes for the same loci you randomly chose before, using the MAF-effect size relationship and then GCTA to actually simulate
m<-read.table("simulation_2/pheno.common.rep1.txt.par",header=T)
head(m)
b<-rnorm(nrow(m),0,1/sqrt(2*m$Frequency*(1-m$Frequency))) ### Making new effect sizes
plot(m$Frequency,b) ### What is the relationship between MAF & effect size?
new_b<-cbind(m$QTL,b) ### Our new list of causal loci with the MAF-scaled effect sizes
write.table(new_b,"simulation_3/common.rep1.MAFscaled.txt",row.names=F,col.names=F,sep="\t",quote=F)  #this is the new phenotype, which you can run and compare to your old simulations (without MAF-scaled effect sizes)

### How would you run this across each of your randomly-generated sets of causal loci?
### Now complete making the phenotypes & running some example analyses with the steps in simulation_3.bash 


############################################################################################
### 3c: Plotting h2 estimates (do *****AFTER****** you've run the simulation_3.bash exercise)
h2<-read.table("simulation_3/previously_completed/all_hsq_estimates.txt",header=T)
h2n<-h2[which(h2$betas=="normal"),]
boxplot(h2n$h2~h2n$CV_MAF,main="",ylim=c(0,1),ylab="SNP-heritability")
### How would you compare all the simulations? What are the tests to run?


##### EXTRA THINGS TO THINK ABOUT
############################################################################################
### 3d. Adjusting heritability: altering the impact of genetic and environmental variance
### Could normalize first, which makes the math a bit easier to think about heritability:
### Ve = Vg/h2 - Vg or (1-h2)*Vg/h2
h2<-.8 ### Narrow-sense heritability to simulate

ngi<-(gi-mean(gi))/sd(gi)
ngi_scale<-scale(gi) #Built-in function to normalize the data
plot(ngi,ngi_scale)
Vgn<-var(ngi)
print(Vgn)

nei<-rnorm(N,0,sqrt((1-h2)*Vgn/h2))
Ven<-var(nei)

pn<-ngi+nei
hist(pn)
var(pn)==Vgn+Ven+2*cov(ngi,nei)

h2_hat<-Vgn/(var(pn))  #### The heritability from the simulated sample. Why doesn't this equal h2=0.8 exactly?
h2_hat


############################################################################################
###### 3e. Very simple association analysis for binary trait - vary the betas, MAFs, etc. & see what it does
k<-.3 ### Population prevalence of a disease
bp<-rep(0,N)
bp[which(pn>quantile(pn,1-k))]<-1   ### Assuming that the sample is a random sample wrt disease
sum(bp)/N

library(foreach) # setting up environment to use multiple cores if possible
library(doMC)
registerDoMC(cores=4)
fitted<-foreach(nn=1:M,.combine=rbind)%dopar%{
			Atmp<-N*cor(bp,gt[nn,])^2 #### Armitage trend test
			c(Atmp,1-pchisq(Atmp,1))

			}
			
### Otherwise:
fitted<-NULL
for(nn in 1:M){
	Atmp<-N*cor(bp,gt[nn,])^2 #### Armitage trend test
	fitted<-rbind(fitted,c(Atmp,1-pchisq(Atmp,1)))
}


par(mfrow=c(3,1),mar=c(3.25,3.25,1,1),mgp=c(2,1,0))
plot(fitted[,1],ylab="association statistic (X2)",xlab="Locus index")
plot(-log10(fitted[,2]),ylab = "-log10 (p-value)",xlab="Locus index")
plot(b,-log10(fitted[,2]),ylab = "-log10 (p-value)",xlab="effect size b")