First login to RStudio
https://workshop-ood.colorado.edu/
Copy the Practical document into your home directory using the
following R commands.
setwd("~/.")
system("mkdir -p Day1")
system("cp /home/loic/2026/Practical_Pop_and_Quant_Gen.html ~/Day1/.")
system("cd ~/Day1")
setwd("Day1")
The R commands below define two functions: (1:
generateNextGeneration()) generates genotypes for the next
generation from a set of genotypes in the current population, and (2:
sim), which uses (1) to simulate the evolution over \(g\) generations of a population with a
constant sample size (\(N\)), a certain
number \(m\) of SNPs segregating in the
founding (or ancestral) population with a frequency (\(p\)). The sim function then
plot the frequency of each of the m alleles over the course
of g generations (each curve
corresponds to a SNP).
generateNextGeneration <- function(x,n){
m <- ncol(x)
N <- nrow(x)
iM <- sample(1:N,n,replace = TRUE)
iF <- sample(1:N,n,replace = TRUE)
xm <- x[iM,]
xf <- x[iF,]
xa <- t(sapply(1:n,function(k) rbinom(m,1,prob=0.5*xm[k,])))
xb <- t(sapply(1:n,function(k) rbinom(m,1,prob=0.5*xf[k,])))
xn <- xa + xb
return(xn)
}
sim <- function(N, # sample size
p, # allele frequency in the founding population
m, # number of marker
g){ # number of generations
## Simulate genotypes in the founding population
x <- do.call("cbind",lapply(1:m,function(j) rbinom(N,size=2,prob=p)))
alleleFreq <- matrix(NA,nrow=g+1,ncol=m)
alleleFreq[1,] <- colMeans(x)/2
for(gen in 1:g){
# cat(paste0("\tGen #",gen,"\n"))
x <- generateNextGeneration(x,N)
alleleFreq[gen+1,] <- colMeans(x)/2
}
par(mar=c(5,5,3,2))
matplot(1:(1+g),alleleFreq,type="l",lty=1,axes=FALSE,
main=paste0("Size of the population: N = ",N),
xlab="Number of generations",ylab="Allele frequencies",
col=sample(colors(),m),log="x",cex.lab=1.2)
axis(1);axis(2)
abline(h=p,col="grey")
return(alleleFreq)
}
Execute these two functions in your R terminal
then run the following command.
t1 <- system.time( m50 <- sim(N=50 ,p=0.05,m=20,g=1000) )
Question 1a. What is the allele frequency after 1000 generations?
Question 1b. Do you see the same patterns if you run this command multiple times?
Question 1c. Try different sizes for the ancestral population (e.g., \(N=100\), or \(N=200\)). How does it affect the frequency trajectories?
Question 1d. Try different allele frequency in the ancestral population (e.g., \(p=0.1\), or \(p=0.01\)). How does it affect the frequency trajectories?
Question 1e. Run the following command. What can you say about the effect of the ancestral population size on the probability of fixation?
t1 <- system.time( m50 <- sim(N=50 ,p=0.5,m=20,g=1000) )
t2 <- system.time( m100 <- sim(N=100 ,p=0.5,m=20,g=1000) )
t3 <- system.time( m200 <- sim(N=200 ,p=0.5,m=20,g=1000) )
The objects m50, m100 and m200
contains the frequency trajectories of three simulated populations with
sizes 50, 100 and 200 respectively. The following R
commands count and visualize the proportion of fixed alleles over 1000
generations.
Run the following command. What can you say about the effect of sample size on the probability of fixation?
f50 <- apply(m50*(1-m50)==0 ,1,mean)
f100 <- apply(m100*(1-m100)==0,1,mean)
f200 <- apply(m200*(1-m200)==0,1,mean)
par(mar=c(5,5,3,2))
matplot(cbind(f50,f100,f200),type="l",lwd=2,lty=1,
xlab="Number of generations",axes=FALSE,cex.lab=2,
ylab="Fraction of fixed allele frequencies")
axis(1);axis(2)
legend(800,0.25,legend=c("N=50","N=100","N=200"),
box.lty=0,fill=1:3,horiz=FALSE,border=0,cex=1)
The R commands below define a function called
FstSim, which tracks \(F_{ST}(t)\) between the ancestral
population and the populuation at time \(t\). This functions reuses the
generateNextGeneration function defined above. Input
parameters for that function are size of the ancestral population (\(N\)), the allele frequency (\(p\)) in the ancestral population, the
number \(m\) of SNPs segregating in the
ancestral population, the number \(g\)
of generations, the growth rate \(r\)
(default \(r=0\), i.e. the population
does not grow over time), and \(k\) an
index for the replicate.
Run the following command to define the function.
FstSim <- function(N=100, # sample size
p=0.5, # allele frequency in the ancestral population
m=100, # number of markers
g=500, # number of generations
r=0.0, # growth rate (r=0, no growth; r=0.01: 1% growth per generation)
k=0, # replicate index
plotIt=TRUE){
## Simulate base population
x <- do.call("cbind",lapply(1:m,function(j) rbinom(N,size=2,prob=p)))
alleleFreq <- matrix(NA,nrow=g+1,ncol=m)
alleleFreq[1,] <- colMeans(x)/2
n <- N
for(gen in 1:g){
n <- round(n*(1+r))
x <- generateNextGeneration(x,n)
alleleFreq[gen+1,] <- colMeans(x)/2
}
cat(paste0("\tReplicate #",k,"\n"))
fst <- apply(alleleFreq,1,function(x) mean((x-p)^2) / (p*(1-p)) )
return(fst)
}
Question 2a. Under the Wright-Fisher model, we can predict that \(F_{ST}(t) \approx 1-e^{-t/(2N)}\). Run the following commands to verify how well this prediction works. If \(N=100\), then how many years do we have to wait for \(F_{ST}(t)\) to reach 0.1 ? What if \(N=10,000\)? (We assume that 1 generation \(\approx\) 25 years).
r <- 0.0 # growth rate of population size
m <- 100 # number of SNPs
p0 <- 0.5 # allele frequency in the ancestral population
nG <- 500 # number of Generations
N <- 100 # size of the ancestral population
dt <- 0:nG # Time (in number of generations)
# Run simulation
Fst <- FstSim(N=N,p=p0,m=m,g=nG,r=r,k=0)
plot(dt,Fst,pch=19,cex=0.5,ylim=c(0,1),cex.lab=1.0,
xlab="Number of generations",ylab="Fst between Current and Ancestral population",
axes=FALSE);axis(1);axis(2)
ExpectedFst <- sapply(dt,function(t) 1-exp(-t/(2*N)))
lines(dt,ExpectedFst,col=2,lwd=2)
legend(0.5*nG,0.2,legend=c("Theoretical Expectation"),col=c(2,4),box.lty=0,lwd=2)
Question 2b. Change the growth rate of the population to \(r=0.01\) (1%). What do you see? [WARNING: Setting growth rate beyond 1% will lead to extremely slow runs].
Question 2c. Change the growth rate of the population to \(r=-0.05\) (-5%). What do you see?
We provide a R script named
simulate-data-practical.R, which will generate the data
(PLINK files: mydata.ped and mydata.map; two
phenotypes in mydata.phen) needed for this practical.
Copy the script to generate the data (the “.” at the end of the command is mandatory to specify that the files are to be copied in the current folder)
system("cd ~/Day1")
system("cp /home/loic/2026/simulate-data-practical.R ~/Day1/.")
Generate the data using the following Rscript command
(takes ~1 minute). Don’t forget to copy
the “.” at the end of the command.
system("Rscript simulate-data-practical.R .")
Check that the data have been correctly generated
system("ls -l")
Convert PLINK files from *.ped format to
*.bed format
system("plink --file mydata --make-bed --out mydata")
Exercise 1: Calculate the Genetic Relationship Matrix (GRM) using all SNPs
system("gcta64 --bfile mydata --make-grm-bin --out mydata_allSNPs")
Questions:
1) How many individuals/SNPs do you have in the sample?
2) What is the mean of the diagonal elements of the GRM? Is that expected?
3) What is the variance of the off-diagonal elements? The inverse of that variance gives an estimate of the effective number of independent markers. Calculate that effective number of markers. Why can it be different from the actual number of SNPs used to build the GRM?
Exercise 2: Calculate GRM using SNPs with MAF>0.05 and MAF<0.05
system("gcta64 --bfile mydata --maf 0.05 --make-grm-bin --out mydata_maf_above_0.05")
system("gcta64 --bfile mydata --max-maf 0.05 --make-grm-bin --out mydata_maf_below_0.05")
Questions:
1) How many SNPs have a MAF<0.05?
2) Why is the variance of diagonal elements larger in the GRM based on SNPs with MAF<0.05?
Some more data processing…
## Creating mgrm.txt file (with the "prefix" of the MAF stratified GRMs)
system("echo Creating mgrm.txt file")
system("echo mydata_maf_below_0.05 > mgrm.txt")
system("echo mydata_maf_above_0.05 >> mgrm.txt")
[Note: in general it is preferable to use the full path for defining
the mgrm.txt file]
Exercise 3: Identify (genetically) unrelated individuals
system("gcta64 --grm mydata_allSNPs --grm-singleton 0.05 --out unrelated")
Question: How many individuals are unrelated in this sample? GRM cut-off 0.05? cut-off 0.025?
Exercise 4: GREML estimation of heritability with and without relatives
system("gcta64 --grm mydata_allSNPs --pheno mydata.phen --mpheno 1 --reml --out trait1_with_relatives")
system("gcta64 --grm mydata_allSNPs --pheno mydata.phen --mpheno 1 --reml --out trait1_without_relatives --grm-cutoff 0.05")
Questions: Compare heritability estimates from these two analyses. What could explain the difference that you observe?
Exercise 5: MAF-stratified heritability estimation (Note: this is using a different phenotype than in Exercise 4)
system("gcta64 --grm mydata_allSNPs --pheno mydata.phen --mpheno 2 --reml --keep unrelated.singleton.txt --out trait2_allSNPs")
system("gcta64 --mgrm mgrm.txt --pheno mydata.phen --mpheno 2 --reml --keep unrelated.singleton.txt --out trait2_maf_stratified")
Questions: Compare heritability estimates from these two analyses
1) What could explain the difference that you observe? 2) Which one of these estimates should you trust more? How can you be sure?
In the first part of the practical we used Identity-By-State (IBS) between (conventionally) unrelated individuals to estimate heritability. Here, we will estimate heritability by leveraging the fact that within-family segregation generates an observable variation of realized relatedness (i.e., proportion of the genome shared Identical By Descent - IBD) around their expected value given the pedigree relationship.
We provide a dataset named sib.RData, which contains
100,000 sibling pairs. The phenotype of siblings are denoted
PhenoSib1 and PhenoSib2 respectively. We also
provide the proportion of their genome that is IBD.
Load the data in R using the following command
load("/home/loic/2026/sib.RData")
The command above will load an R data.frame object named
sibdt. Here is an overview the data
head(sibdt,4)
Question 1. Calculate the mean and standard deviation of the IBD proportion (10th column). Are these results (at least the mean) expected?
Question 2. Calculate the phenotypic correlation between siblings. Under what assumption(s) is twice that correlation an unbiased estimator of the narrow sense heritability (Hint: You may refer to Practical 2 to answer the second part of the question)?
We now use Haseman-Elston (HE) regression to estimate heritability. Run the following command
summary( lm(I(PhenoSib1 * PhenoSib2) ~ IBD, data=sibdt) )$coefficients
Question 3. What is the heritability of the trait of interest? Is the result different from that obtained for Question 2? What could be the interpretation of the intercept?
We now select the first 5000 pairs to estimate heritability using the Restricted Maximum Likelihood (REML) implemented in GCTA. We only use 5000 pairs during the practical to reduce computational time. Analyzing 100,000 pairs would take days of calculations.
First run the following R command to convert the data
into GCTA’s GRM format.
prefix <- "sibdt5k"
n <- 5000
N <- 2*n
G <- diag(N)
iid <- do.call("c",lapply(1:n,function(i){
as.character( sibdt[i,c("IID1","IID2")])
} ) )
phn <- do.call("c",lapply(1:n,function(i) {
as.numeric( sibdt[i,c("PhenoSib1","PhenoSib2")])}
) )
fid <- rep(1:n,each=2)
pheno10ksib <- cbind.data.frame(fid,iid,phn)
write.table(pheno10ksib,paste0(prefix,".phen"),quote=F,row.names = F,col.names = F)
for(i in 1:n){
G[2*(i-1)+1,2*i] <- G[2*i,2*(i-1)+1] <- sibdt[i,"IBD"]
}
size <- 4 #defines how many bytes of memory will be used to represent GRM values
collapsed.grm <- G[upper.tri(G,diag = TRUE)] #Create a vector of GRM values
write.table(cbind(fid,iid),paste0(prefix,".grm.id"),quote=F,row.names=F,col.names = F)
BinFileName <- paste0(prefix,".grm.bin") #Name the GRM
BinFile <- file(BinFileName, "wb") #Create the file
writeBin(con = BinFile, collapsed.grm, size = size) #Write the file, size = 4
close(BinFile)
Now run GCTA with the --reml-wfam option,
which performs REML assuming (in this case knowing) that all families
are independent. This is much faster than using the standard
--reml flag. For simplicity, we will call GCTA
from R using the system() function. Type the
following command in your R console.
Output from GCTA will simply be displayed in the console but actual results are still stored in the separate files.
system("gcta64 --grm sibdt5k --pheno sibdt5k.phen --reml-wfam --out sibdt5k")
Question 4. What is the REML heritability estimates and its standard error? Compare your results with those from HE regression above and from HE regression using the first 5000 pairs. For the same sample size (n=5000 sibling pairs), how much larger are standard errors from HE regression relative to that of REML?
summary( lm(I(PhenoSib1 * PhenoSib2) ~ IBD, data=sibdt[1:5000,]) )$coefficients