#!/usr/bin/Rscript


############################################################################################
### 1a: Genotype = two draws from a binomial distribution with an allele frequency

rbinom(1,2,.5)

### 10 individuals, same allele frequency 
rbinom(10,2,.5)

### 10 different loci for a single individual, all different allele frequencies
rbinom(10,2,seq(0,.9,by=0.1))

### HWE expectations = p^2, 2pq, q^2.
p<-0.5;q<-1-p
hweprop = c(p^2, 2*p*q, q^2)
100*hweprop ### expected genotype counts for 100 individuals
table(rbinom(100,2,p)) ### Randomly generated genotypes. Why aren't these exactly the same as the expected? What is the influence of increasing your sample size?


############################################################################################
### 1b: Creating many genotypes from a range of allele frequencies and for many individuals
N<-2000 ### Number of individuals
M<-500 ### Number of loci to create genotypes for
min_maf<-??? ### what minor allele frequency range do you want to simulate? maybe something like 0.05-0.45?
max_maf<-??? 

af<-runif(M,min_maf,max_maf) ### Allele frequencies of a population. What MAF ranges do you want to include? Common, rare, mix, etc.? Is a uniform distribution reasonable? Something else?
hist(af,breaks=50)

ggen<-function(rep){rbinom(M,2,af)} ### Function to make genotypes for M loci from the allele frequencies

gt<-sapply(rep(0,N),ggen) ### Creating the N individuals' genotypes

af_samp<-apply(gt,1,sum)/(N*2) ### The allele frequencies in the SAMPLE
hist(af_samp,breaks=50)

plot(the true allele frequencies against the simulated allele frequencies in the sample) ### Why don't the allele frequencies in the simulated sample exactly equal the true (also simulated) population allele frequencies?


############################################################################################
### 1c: Creating phenotypes from the above genotypes:
### Model is phenotype = genotype + environment (P=G+E)
b<-rnorm(M,0,1) ### allele effect sizes, drawn from standard normal distribution. 
### Simulating the phenotypes
gi<-as.vector(??? %*% ???) ### Making the individuals' genotypic values (hint: multiply the betas by the genotypes [1,M]%*%[M,N] = [1,N])
Vg<-var(gi)
hist(gi,breaks=50)

### Add in error - why do you need this? 
ei<-rnorm(N,0,sqrt(Vg)) #Note this should have h2=0.5. Why? Could vary this as well? Make a difference?
Ve <-var(ei)
hist(ei)

### Make phenotype:
p <- ??? ### What do you include here to create the phenotype?
hist(p)
Vp<-var(p)

### How do you calculate the heritabiltity of your trait? What are the variances above to consider?
h2<-??? / ????


##### EXTRA THINGS TO THINK ABOUT
############################################################################################
### 1d. Simulating two populations, and doing a simple association test:
library(MASS)
library(foreach) # setting up environment to use multiple cores
library(doMC)
registerDoMC(cores=4)


Fst<-0.025 # Fst value to test
M<-1000 # Polygenecity of the trait. The number of CVs
n<-2000 # sample size per population. Note that total would be 2*n
k<-0.5 # prevalence for binary phenotype
tM<-10000 # Number of random test variants for GWAS


		## allele frequencies in two populations (see Zaitlen et al. 2014 Nat. Gen. for an example using this kind of two-population simulation)
		afs<-runif(M,0.1,0.9) # Ancestral population AF
		af1<-rbeta(M,afs*(1-Fst)/Fst,(1-afs)*(1-Fst)/Fst)
		af2<-rbeta(M,afs*(1-Fst)/Fst,(1-afs)*(1-Fst)/Fst)
		
		### effect sizes related to the average MAF between the two populations
		afm<-apply(cbind(af1,af2),1,mean)
		b<-rnorm(M,0,1/sqrt(2*afm*(1-afm)))

		### Create phenotypes:
		## for the two populations
		g1gen<-function(rep){rbinom(M,2,af1)}
		g2gen<-function(rep){rbinom(M,2,af2)}
		g<-cbind(sapply(rep(0,n),g1gen),sapply(rep(0,n),g2gen))
		gp<-b%*%g
		p<-gp+rnorm(n*2,0,sd(gp)) #Note this should have h2=0.5. Could vary this as well? Make a difference?
		p<-scale(as.vector(p))

		## A single pop with ==MAFs & betas as pop 1 (so should be directly comparable and the Lambda denom)
		gs<-cbind(sapply(rep(0,n),g1gen),sapply(rep(0,n),g1gen))
		gps<-b%*%gs
		ps<-gps+rnorm(n*2,0,sd(gps))

		### Make case/control phenotype with k prevalence
		bp<-rep(0,n*2)
		bp[which(p>quantile(p,1-k))]<-1
		bps<-rep(0,n*2)
		bps[which(ps>quantile(ps,1-k))]<-1

		#### Make totally unrelated genotypes to test for association
		tafs<-runif(tM,0.1,0.9) # Ancestral population AF
		taf1<-rbeta(tM,tafs*(1-Fst)/Fst,(1-tafs)*(1-Fst)/Fst)
		taf2<-rbeta(tM,tafs*(1-Fst)/Fst,(1-tafs)*(1-Fst)/Fst)

		tg1gen<-function(rep){rbinom(tM,2,taf1)}
		tg2gen<-function(rep){rbinom(tM,2,taf2)}
		tg<-cbind(sapply(rep(0,n),tg1gen),sapply(rep(0,n),tg2gen))
		tgs<-cbind(sapply(rep(0,n),tg1gen),sapply(rep(0,n),tg1gen))

		### Testing each of the test markers with Armitage trend.
		fitted<-foreach(nn=1:tM,.combine=rbind)%dopar%{
			ATmp<-n*2*cor(as.vector(bp),tg[nn,])^2
			sATmp<-n*2*cor(as.vector(bps),tgs[nn,])^2
			c(ATmp, 1-pchisq(ATmp,1), sATmp, 1-pchisq(sATmp,1))
		}


colnames(fitted)<-c("Test_stat_2populations","p_value_2pop","Test_stat_1populations","p_value_1pop")
par(mfrow=c(2,1))
hist(fitted[,2],main="P-values with stratified sample") ### Try varying Fst and seeing how it impacts Type I error rate
hist(fitted[,4],main="P-values with panmictic sample")