#!/usr/bin/Rscript ############################################################################################ ### 1a: Genotype = two draws from a binomial distribution with an allele frequency rbinom(1,2,.5) ### 10 individuals, same allele frequency rbinom(10,2,.5) ### 10 different loci for a single individual, all different allele frequencies rbinom(10,2,seq(0,.9,by=0.1)) ### HWE expectations = p^2, 2pq, q^2. p<-0.5;q<-1-p hweprop = c(p^2, 2*p*q, q^2) 100*hweprop ### expected genotype counts for 100 individuals table(rbinom(100,2,p)) ### Randomly generated genotypes. Why aren't these exactly the same as the expected? What is the influence of increasing your sample size? ############################################################################################ ### 1b: Creating many genotypes from a range of allele frequencies and for many individuals N<-2000 ### Number of individuals M<-500 ### Number of loci to create genotypes for min_maf<-??? ### what minor allele frequency range do you want to simulate? maybe something like 0.05-0.45? max_maf<-??? af<-runif(M,min_maf,max_maf) ### Allele frequencies of a population. What MAF ranges do you want to include? Common, rare, mix, etc.? Is a uniform distribution reasonable? Something else? hist(af,breaks=50) ggen<-function(rep){rbinom(M,2,af)} ### Function to make genotypes for M loci from the allele frequencies gt<-sapply(rep(0,N),ggen) ### Creating the N individuals' genotypes af_samp<-apply(gt,1,sum)/(N*2) ### The allele frequencies in the SAMPLE hist(af_samp,breaks=50) plot(the true allele frequencies against the simulated allele frequencies in the sample) ### Why don't the allele frequencies in the simulated sample exactly equal the true (also simulated) population allele frequencies? ############################################################################################ ### 1c: Creating phenotypes from the above genotypes: ### Model is phenotype = genotype + environment (P=G+E) b<-rnorm(M,0,1) ### allele effect sizes, drawn from standard normal distribution. ### Simulating the phenotypes gi<-as.vector(??? %*% ???) ### Making the individuals' genotypic values (hint: multiply the betas by the genotypes [1,M]%*%[M,N] = [1,N]) Vg<-var(gi) hist(gi,breaks=50) ### Add in error - why do you need this? ei<-rnorm(N,0,sqrt(Vg)) #Note this should have h2=0.5. Why? Could vary this as well? Make a difference? Ve <-var(ei) hist(ei) ### Make phenotype: p <- ??? ### What do you include here to create the phenotype? hist(p) Vp<-var(p) ### How do you calculate the heritabiltity of your trait? What are the variances above to consider? h2<-??? / ???? ##### EXTRA THINGS TO THINK ABOUT ############################################################################################ ### 1d. Simulating two populations, and doing a simple association test: library(MASS) library(foreach) # setting up environment to use multiple cores library(doMC) registerDoMC(cores=4) Fst<-0.025 # Fst value to test M<-1000 # Polygenecity of the trait. The number of CVs n<-2000 # sample size per population. Note that total would be 2*n k<-0.5 # prevalence for binary phenotype tM<-10000 # Number of random test variants for GWAS ## allele frequencies in two populations (see Zaitlen et al. 2014 Nat. Gen. for an example using this kind of two-population simulation) afs<-runif(M,0.1,0.9) # Ancestral population AF af1<-rbeta(M,afs*(1-Fst)/Fst,(1-afs)*(1-Fst)/Fst) af2<-rbeta(M,afs*(1-Fst)/Fst,(1-afs)*(1-Fst)/Fst) ### effect sizes related to the average MAF between the two populations afm<-apply(cbind(af1,af2),1,mean) b<-rnorm(M,0,1/sqrt(2*afm*(1-afm))) ### Create phenotypes: ## for the two populations g1gen<-function(rep){rbinom(M,2,af1)} g2gen<-function(rep){rbinom(M,2,af2)} g<-cbind(sapply(rep(0,n),g1gen),sapply(rep(0,n),g2gen)) gp<-b%*%g p<-gp+rnorm(n*2,0,sd(gp)) #Note this should have h2=0.5. Could vary this as well? Make a difference? p<-scale(as.vector(p)) ## A single pop with ==MAFs & betas as pop 1 (so should be directly comparable and the Lambda denom) gs<-cbind(sapply(rep(0,n),g1gen),sapply(rep(0,n),g1gen)) gps<-b%*%gs ps<-gps+rnorm(n*2,0,sd(gps)) ### Make case/control phenotype with k prevalence bp<-rep(0,n*2) bp[which(p>quantile(p,1-k))]<-1 bps<-rep(0,n*2) bps[which(ps>quantile(ps,1-k))]<-1 #### Make totally unrelated genotypes to test for association tafs<-runif(tM,0.1,0.9) # Ancestral population AF taf1<-rbeta(tM,tafs*(1-Fst)/Fst,(1-tafs)*(1-Fst)/Fst) taf2<-rbeta(tM,tafs*(1-Fst)/Fst,(1-tafs)*(1-Fst)/Fst) tg1gen<-function(rep){rbinom(tM,2,taf1)} tg2gen<-function(rep){rbinom(tM,2,taf2)} tg<-cbind(sapply(rep(0,n),tg1gen),sapply(rep(0,n),tg2gen)) tgs<-cbind(sapply(rep(0,n),tg1gen),sapply(rep(0,n),tg1gen)) ### Testing each of the test markers with Armitage trend. fitted<-foreach(nn=1:tM,.combine=rbind)%dopar%{ ATmp<-n*2*cor(as.vector(bp),tg[nn,])^2 sATmp<-n*2*cor(as.vector(bps),tgs[nn,])^2 c(ATmp, 1-pchisq(ATmp,1), sATmp, 1-pchisq(sATmp,1)) } colnames(fitted)<-c("Test_stat_2populations","p_value_2pop","Test_stat_1populations","p_value_1pop") par(mfrow=c(2,1)) hist(fitted[,2],main="P-values with stratified sample") ### Try varying Fst and seeing how it impacts Type I error rate hist(fitted[,4],main="P-values with panmictic sample")