~~~~~~~~~~~~~~~~~~~~~~~~ Data and Formats # Begin by looking at the ped file # Use the -S flag to stop rows from wrapping less -S cc.ped # How is sex coded? # How is the trait coded? # Is it continuous or case-control? # Look at the map file less cc.map # Which build are these data mapped to? # Convert the ped & map file to plink formatted files plink --ped cc.ped --map cc.map --make-bed --out cc.begin # How many variants and people are there? # What about this warning? “Warning: 2177 het. haploid genotypes present (see cc.begin.hh )” # Heterozygous haploid errors may be pseudo-autosomal region of chr X, can use split-x # First we will check for sex errors in the data. ~~~~~~~~~~~~~~~~~~~~~~~~ Data Cleaning # Check for a mismatch between sex reported in the ped file and sex from genotype data. plink --bfile cc.begin --check-sex --out sex # This will create a file called sex.sexcheck with the columns: FID IID PEDSEX SNPSEX STATUS F WGACON 1 1 1 OK 1 # Look at the created file less sex.sexcheck # Open the R script “plink-qc.R” in Rstudio # We will use this script to plot the sex-check data # plot the sex heterozygosity as a histogram and as a boxplot # Select the 3 individuals flagged as not a match between reported and genotyped sex grep PROBLEM sex.sexcheck > sex.drop # Remove those individuals plink --bfile cc.begin --remove sex.drop --make-bed --out cc.qc1 # How many variants and people pass filters and QC? # How many het haploid genotypes now? # Use split-x to see if the remaining issues are due to the pseudo-autosomal regions plink --bfile cc.qc1 --split-x b37 no-fail --make-bed --out cc.qc2 # This awk code will match on the chromosome from the bim file to the remaining het.haploid variants awk 'NR==FNR{a[$2]=$1; next} $3 in a{print $0,a[$3]}' cc.begin.bim cc.qc2.hh > check.hh # What chromosome(s) are they on? # These remaining will be removed for analytical commands, so we will remove them here. plink --bfile cc.qc2 --set-hh-missing --make-bed --out cc.qc3 ~~~~~~~~~~~~~~~~~~~~~~~~ Obtain data for plotting heterozygosity by missing # Obtain heterozygosity information (this is across chr 1-22) plink --bfile cc.qc3 --het --out het # This will create a file called het.het with the columns: FID IID O(HOM) E(HOM) N(NM) F # Create file with the proportion of heterozygosity for each person. echo "FID IID obs_HOM N_SNPs prop_HET" > het.txt awk 'NR>1{print $1,$2,$3,$5,($5-$3)/$5}' het.het >> het.txt # Obtain missingness on individuals and on SNPs plink --bfile cc.qc3 --missing --out miss # This will create a file called miss.imiss with the columns: FID IID MISS_PHENO N_MISS N_GENO F_MISS WGACON 1 N 4634 484128 0.009572 # and one called miss.lmiss with the columns: CHR SNP N_MISS N_GENO F_MISS 1 rs3094315 3 190 0.01579 # Create a file that holds both proportion of heterozygosity and individual missingness awk 'NR==FNR{a[$1,$2]=$5;next}($1,$2) in a{print $1,$2,$6,a[$1,$2]}' het.txt miss.imiss > het.imiss.txt # This will create a file called het.imiss.txt with the columns: FID IID F_MISS prop_HET WGACON 1 0.009572 0.289072 # Go back to “plink-qc.R” # Plot the heterozygosity histogram # Plot the sample call rate # How many individuals have more than 5% missing genotyped data? ~~~~~~~~~~~~~~~~~~~~~~~~ Cleaning variants # Still in “plink-qc.R” # Plot the histogram of genetic variants missing data # Remove variants that are missing information from too many individuals plink --bfile cc.qc3 --geno 0.05 --make-bed --out cc.qc4 # How many SNPs were removed? # When cleaning data collected from case-control, # check that variants are not missing disproportionately between cases and controls. plink --bfile cc.qc4 --test-missing --out case-control # This awk code will copy any variants that differ in missingness with a p value < .00001 into a drop file awk '$5<=0.00001' case-control.missing > case-control.missing.drop # Obtain MAF information plink --bfile cc.qc3 --freq --out maf # In “plink-qc.R” # Plot the histogram of the MAF # Remove variants with minor allele frequency (MAF) < .01 and Hardy Weinberg Equation (HWE) deviation < 1e-6 # The default HWE check on case-control data is to only conduct the check on the controls. # We have overwritten that here with the [include-nonctrl] flag. plink --bfile cc.qc4 --maf 0.01 --hwe 1e-6 include-nonctrl --make-bed --out cc.qc5 # How many SNPs were removed due to HWE < 1e-6? # How many SNPs were removed due to MAF < 0.01? # After QC on variants, how many variant and individuals remain? ~~~~~~~~~~~~~~~~~~~~~~~~ Cleaning individuals # Remove individuals with missing information on more than 5% of the variants plink --bfile cc.qc5 --mind 0.05 --make-bed --out cc.qc6 # How many individuals were removed? # Obtain the heterozygosity scores now that the data variant have been cleaned. plink --bfile cc.qc6 --het --out het2 echo "FID IID obs_HOM N_SNPs prop_HET" > het2.txt awk 'NR>1{print $1,$2,$3,$5,($5-$3)/$5}' het2.het >> het2.txt awk 'NR>1{sum+=$5;sq+=$5^2}END{avg=sum/(NR-1);print avg-3*(sqrt(sq/(NR-2)-2*avg*(sum/(NR-2))+(((NR-1)*(avg^2))/(NR-2)))),avg+3*(sqrt(sq/(NR-2)-2*avg*(sum/(NR-2))+(((NR-1)*(avg^2))/(NR-2))))}' het2.txt awk '$5<=0.29957 || $5>= 0.326526' het2.txt > het.drop # Remove the 3 people outside 3SD from the mean heterozygosity rate plink --bfile cc.qc6 --remove het.drop --make-bed --out cc.clean ~~~~~~~~~~~~~~~~~~~~~~~~ Check Relatedness # First prune SNPs and select chr 1-22, this creates a file of SNPs pruned to be independent plink --bfile cc.clean --chr 1-22 --indep-pairwise 1000 5 0.2 --out indep # Calculate identity by descent (IBD) on those pruned SNPs plink --bfile cc.clean --extract indep.prune.in --genome --out ibd less ibd.genome # Z0 Z1 Z2 = the pair of IDs share 0, 1 or 2 alleles by descent # Ideally 1,0,0 = unrelated; 0,1,0 = parent-child; .25,.5,.25 = siblings; etc # pihat = proportion IBD = P(IBD=2)+0.5*P(IBD=1) # Obtain a list of individuals related more than a pihat of .2 plink --bfile cc.clean --extract indep.prune.in --genome --min 0.2 --out pihat # Check them in the file pihat.genome