~~~~~~~~~~~~~~~~~~~~~~~~ Data and Formats

# Begin by looking at the ped file
# Use the -S flag to stop rows from wrapping

less -S cc.ped

# How is sex coded?
# How is the trait coded?
# Is it continuous or case-control?

# Look at the map file

less cc.map

# Which build are these data mapped to?

# Convert the ped & map file to plink formatted files
plink --ped cc.ped --map cc.map --make-bed --out cc.begin

# How many variants and people are there?
# What about this warning?
“Warning: 2177 het. haploid genotypes present (see cc.begin.hh )”
# Heterozygous haploid errors may be pseudo-autosomal region of chr X, can use split-x
# First we will check for sex errors in the data.


~~~~~~~~~~~~~~~~~~~~~~~~ Data Cleaning

# Check for a mismatch between sex reported in the ped file and sex from genotype data.

plink --bfile cc.begin --check-sex --out sex

# This will create a file called sex.sexcheck with the columns:
FID    IID       PEDSEX       SNPSEX       STATUS            F
WGACON   1         1            1           OK               1

# Look at the created file
less sex.sexcheck 

# Open the R script “plink-qc.R”  in Rstudio# We will use this script to plot the sex-check data# plot the sex heterozygosity as a histogram and as a boxplot


# Select the 3 individuals flagged as not a match between reported and genotyped sex
grep PROBLEM sex.sexcheck > sex.drop

# Remove those individuals
plink --bfile cc.begin --remove sex.drop --make-bed --out cc.qc1

# How many variants and people pass filters and QC?
# How many het haploid genotypes now?

# Use split-x to see if the remaining issues are due to the pseudo-autosomal regions
plink --bfile cc.qc1 --split-x b37 no-fail --make-bed --out cc.qc2

# This awk code will match on the chromosome from the bim file to the remaining het.haploid variants
awk 'NR==FNR{a[$2]=$1; next} $3 in a{print $0,a[$3]}' cc.begin.bim cc.qc2.hh > check.hh

# What chromosome(s) are they on?
# These remaining will be removed for analytical commands, so we will remove them here.

plink --bfile cc.qc2 --set-hh-missing --make-bed --out cc.qc3


~~~~~~~~~~~~~~~~~~~~~~~~ Obtain data for plotting heterozygosity by missing

# Obtain heterozygosity information (this is across chr 1-22)

plink --bfile cc.qc3 --het --out het

# This will create a file called het.het with the columns:
FID  IID       O(HOM)       E(HOM)        N(NM)            F

# Create file with the proportion of heterozygosity for each person.

echo "FID IID obs_HOM N_SNPs prop_HET" > het.txt 
awk 'NR>1{print $1,$2,$3,$5,($5-$3)/$5}' het.het >> het.txt

# Obtain missingness on individuals and on SNPs

plink --bfile cc.qc3 --missing --out miss

# This will create a file called miss.imiss with the columns:
FID  IID MISS_PHENO   N_MISS   N_GENO   F_MISS
WGACON    1          N     4634   484128 0.009572

# and one called miss.lmiss with the columns:
 CHR         SNP   N_MISS   N_GENO   F_MISS
  1   rs3094315        3      190  0.01579

# Create a file that holds both proportion of heterozygosity and individual missingness
awk 'NR==FNR{a[$1,$2]=$5;next}($1,$2) in a{print $1,$2,$6,a[$1,$2]}' het.txt miss.imiss > het.imiss.txt 

# This will create a file called het.imiss.txt with the columns:FID IID F_MISS prop_HETWGACON 1 0.009572 0.289072# Go back to “plink-qc.R” # Plot the heterozygosity histogram# Plot the sample call rate# How many individuals have more than 5% missing genotyped data? 
~~~~~~~~~~~~~~~~~~~~~~~~ Cleaning variants
# Still in “plink-qc.R” 
# Plot the histogram of genetic variants missing data


# Remove variants that are missing information from too many individuals
plink --bfile cc.qc3 --geno 0.05 --make-bed --out cc.qc4 

# How many SNPs were removed?

# When cleaning data collected from case-control, 
# check that variants are not missing disproportionately between cases and controls.

plink --bfile cc.qc4 --test-missing --out case-control 

# This awk code will copy any variants that differ in missingness with a p value < .00001 into a drop file
awk '$5<=0.00001' case-control.missing > case-control.missing.drop

# Obtain MAF information
plink --bfile cc.qc3  --freq --out maf

# In “plink-qc.R” 
# Plot the histogram of the MAF

# Remove variants with minor allele frequency (MAF) < .01 and Hardy Weinberg Equation (HWE) deviation < 1e-6
# The default HWE check on case-control data is to only conduct the check on the controls.
# We have overwritten that here with the [include-nonctrl] flag. 

plink --bfile cc.qc4 --maf 0.01 --hwe 1e-6 include-nonctrl --make-bed --out cc.qc5 

# How many SNPs were removed due to HWE < 1e-6? 
# How many SNPs were removed due to MAF < 0.01? # After QC on variants, how many variant and individuals remain?


~~~~~~~~~~~~~~~~~~~~~~~~ Cleaning individuals

# Remove individuals with missing information on more than 5% of the variants

plink --bfile cc.qc5 --mind 0.05 --make-bed --out cc.qc6 

# How many individuals were removed?

# Obtain the heterozygosity scores now that the data variant have been cleaned.

plink --bfile cc.qc6 --het --out het2

echo "FID IID obs_HOM N_SNPs prop_HET" > het2.txt 
awk 'NR>1{print $1,$2,$3,$5,($5-$3)/$5}' het2.het >> het2.txt

awk 'NR>1{sum+=$5;sq+=$5^2}END{avg=sum/(NR-1);print  avg-3*(sqrt(sq/(NR-2)-2*avg*(sum/(NR-2))+(((NR-1)*(avg^2))/(NR-2)))),avg+3*(sqrt(sq/(NR-2)-2*avg*(sum/(NR-2))+(((NR-1)*(avg^2))/(NR-2))))}' het2.txt

awk '$5<=0.29957 || $5>= 0.326526' het2.txt  > het.drop 


# Remove the 3 people outside 3SD from the mean heterozygosity rate
plink --bfile cc.qc6 --remove het.drop --make-bed --out cc.clean


~~~~~~~~~~~~~~~~~~~~~~~~ Check Relatedness

# First prune SNPs and select chr 1-22, this creates a file of SNPs pruned to be independent
plink --bfile cc.clean --chr 1-22 --indep-pairwise 1000 5 0.2 --out indep

# Calculate identity by descent (IBD) on those pruned SNPs
plink --bfile cc.clean --extract indep.prune.in --genome --out ibd

less ibd.genome

# Z0 Z1 Z2 = the pair of IDs share 0, 1 or 2 alleles by descent
# Ideally 1,0,0 = unrelated; 0,1,0 = parent-child; .25,.5,.25 = siblings; etc
# pihat = proportion IBD = P(IBD=2)+0.5*P(IBD=1) 

# Obtain a list of individuals related more than a pihat of .2
plink --bfile cc.clean --extract indep.prune.in --genome --min 0.2 --out pihat

# Check them in the file pihat.genome