#!/usr/bin/Rscript


############################################################################################
### Genotype = two draws from a binomial distribution with an allele frequency

rbinom(1,1,.5)

### 10 individuals, same allele frequency 
rbinom(10,2,.5)

### HWE expectations = p^2, 2pq, q^2
hweprop = c(0.5^2, 2*.5*.5,0.5^2)
100*hweprop
table(rbinom(100,2,.5)) 


############################################################################################
### Creating many genotypes from a range of allele frequencies and for many individuals


N<-2000 ### Number of individuals
M<-500 ### Number of loci to create genotypes for


af<-runif(M,0.05,0.45) ### Allele frequencies of a population
hist(af,breaks=50)


ggen<-function(rep){rbinom(M,2,af)} ### Function to make genotypes for individuals from the allele frequency


gt<-sapply(rep(0,N),ggen) ### Creating the individual genotypes

af_samp<-apply(gt,1,sum)/(N*2) ### The allele frequencies in the SAMPLE
hist(af_samp,breaks=20)

plot(af,af_samp) ### Why don't the allele frequencies in the simulated sample equal the true (also simulated) population allele frequencies?


############################################################################################
### Making phenotypes from a simple polygenic model for the N individuals from M loci

### first make allelic effects: 1XM matrix
b<-matrix(rnorm(M,0,1/sqrt(2*af*(1-af))),1,M)  #why use afs rather than p? Impact of the assumed variance of allelic effects?
#b<-rnorm(M,0,1) ### use a different variance for the allelic effects
par(mfrow=c(3,1),mar=c(3.5,3.5,1,1),mgp=c(2,1,0))
hist(af);hist(b);plot(af,b)

### What is the impact of the different assumptions of variance of allelic effects? (see Falconer & Mackay)
h2_persnp_1<-2*af*(1-af)*rnorm(M,0,1)^2
h2_persnp_scaled<-2*af*(1-af)*rnorm(M,0,1/sqrt(2*af*(1-af)))^2
par(mfrow=c(2,1),mar=c(3.25,3.25,1,1),mgp=c(2,1,0))
plot(af,h2_persnp_1);plot(af,h2_persnp_scaled)


### Simulating the phenotypes
gi<-as.vector(b%*%gt) ### Making the individuals' genotypic values
Vg<-var(gi)
hist(gi,breaks=50)

### Add in error
ei<-rnorm(N,0,sqrt(Vg)) #Note this should have h2=0.5. Could vary this as well? Make a difference?
Ve <-var(ei)
hist(ei)

### Make phenotype:
p <- gi + ei
hist(p)
Vp<-var(as.vector(p))


############################################################################################
### Adjusting heritability: altering the impact of genetic and environmental variance
### Could normalize first, which makes the math a bit easier to think about heritability:
h2<-.8 ### Narrow-sense heritability to simulate

ngi<-(gi-mean(gi))/sd(gi)
ngi_scale<-scale(gi) #Built-in function to normalize the dat
plot(ngi,ngi_scale)
Vgn<-var(ngi)
print(Vgn)

nei<-rnorm(N,0,sqrt((1-h2)*Vgn/h2))
Ven<-var(nei)

pn<-ngi+nei
hist(pn)
var(pn)==Vgn+Ven+2*cov(ngi,nei)

h2_hat<-Vgn/(var(pn))  #### The heritability from the simulated sample. Why doesn't this equal h2=0.8 exactly?
h2_hat


############################################################################################
###### Very simple association analysis for binary trait
k<-.3 ### Population prevalence
bp<-rep(0,N)
bp[which(pn>quantile(pn,1-k))]<-1   ### Assuming that the sample is a random sample wrt disease
sum(bp)/N

library(foreach) # setting up environment to use multiple cores if possible
library(doMC)
registerDoMC(cores=4)
fitted<-foreach(nn=1:M,.combine=rbind)%dopar%{
			Atmp<-N*cor(bp,gt[nn,])^2 #### Armitage trend test
			c(Atmp,1-pchisq(Atmp,1))

			}
			
### Otherwise:
fitted<-NULL
for(nn in 1:M){
	Atmp<-N*cor(bp,gt[nn,])^2 #### Armitage trend test
	fitted<-rbind(fitted,c(Atmp,1-pchisq(Atmp,1)))
}


par(mfrow=c(3,1),mar=c(3.25,3.25,1,1),mgp=c(2,1,0))
plot(fitted[,1],ylab="association statistic (X2)",xlab="Locus index")
plot(-log10(fitted[,2]),ylab = "-log10 (p-value)",xlab="Locus index")
plot(fitted[,1],-log10(fitted[,2]),ylab = "-log10 (p-value)",xlab="association statistic (X2)")