#!/usr/local/bin/perl

# a program to convert plink files to bayescan files for two populations 
# Abdel Abdellaoui (a.abdellaoui@vu.nl)

die "Usage: convert_to_bayescan.pl <plink input file without extension [tped & tfam format]> <output file>\n" 
        unless @ARGV == 2;

$output1 = "@ARGV[1]";
$output2 = "@ARGV[1]" . ".snpids";

open OUT1 , ">$output1" or die "Can't open $output: @ARGV[1]!\n";
open OUT2 , ">$output2" or die "Can't open $output: @ARGV[1].snpids!\n";

my $input_tfam = "@ARGV[0]" . ".tfam";
my $input_tped = "@ARGV[0]" . ".tped";
my $input_bim = "@ARGV[0]" . ".bim";

############################ input files ##########################

# The input file must be made in plink using "--recode --transpose" and "--make-bed".

# The .tfam file consists of the following columns:
# 0: Family ID
# 1: Individual ID
# 2: Paternal ID
# 3: Maternal ID
# 4: Sex (1=male; 2=female; other=unknown)
# 5: Population (store the two populations here as 1 and 2)

# The .bim file consists of the following columns:
# 0: chromosome (1-22, X, Y or 0 if unplaced)
# 1: rs# or snp identifier
# 2: Genetic distance (morgans)
# 3: Base-pair position (bp units)
# 4: A1
# 5: A2

# The .tped file consists of the following columns:
# 0: chromosome (1-22, X, Y or 0 if unplaced)
# 1: rs id
# 2: Genetic distance (morgans)
# 3: Base-pair position (bp units)
# 4: A1 - Ind1
# 5: A2 - Ind1
# 6: A1 - Ind2
# 7: A2 - Ind2
# 8: A1 - Ind3
# etc..

####################### read in individual file #####################

my $nind = 0;

my %IID = ();
my %FID = ();
my %ID = ();
my %ind = ();
my %pop = ();

open (READ1, $input_tfam) || die "Cannot open $input_tfam\n";

	while ($line=<READ1>) 
		{
		my @splitFAMline = split (/\s+/  , $line);		# create array 

		$nind++;

		$FID{$nind} = @splitFAMline[0];
		$IID{$nind} = @splitFAMline[1];
		$ID{$nind} = $FID . "-" . $IID;		
		$pop{$nind} = @splitFAMline[5];		

		undef @splitFAMline;
		}
close READ1;

$totalind = $nind;

#####################################################################

########################## read in bim file #########################

my $nsnps = 0;

open (READ2, $input_bim) || die "Cannot open $input_tped\n";

	while ($line=<READ2>) 
		{
		$nsnps++;
		my @splitBIMline = split (/\s+/  , $line);		# create array 

		$chr{$nsnps} = shift(@splitBIMline); #[0]
		$rsid{$nsnps} = shift(@splitBIMline); #[1]
		$skipPEDelement = shift(@splitBIMline); #[2]
		$bp{$nsnps} = shift(@splitBIMline); #[3]
		$A1{$nsnps} = shift(@splitBIMline); #[4]
		$A2{$nsnps} = shift(@splitBIMline); #[5]
		$A1_pop1_total{$nsnps} = 0;
		$A2_pop1_total{$nsnps} = 0;
		$A1_pop2_total{$nsnps} = 0;
		$A2_pop2_total{$nsnps} = 0;
		}

$totalsnps = $nsnps;

close READ2;

#####################################################################

########################## read in SNP file #########################

$nsnps = 0;

open (READ3, $input_tped) || die "Cannot open $input_tped\n";

	while ($line=<READ3>) 
		{
		$nsnps++;
		my @splitPEDline = split (/\s+/  , $line);		# create array 

		$chr = shift(@splitPEDline); #[0]
		$rsid = shift(@splitPEDline); #[1]
		$skipPEDelement = shift(@splitPEDline); #[2]
		$bp = shift(@splitPEDline); #[3]

		print "SNPs are not in the same order in bim and tped file/n" and exit if ($rsid ne $rsid{$nsnps});

		# store genotypes for all individuals:
		for($i=1;$i<$totalind;++$i)
			{
			$A1_ind = shift(@splitPEDline);
			$A2_ind = shift(@splitPEDline);

			if (($A1_ind eq $A1{$nsnps}) && ($pop{$i} == 1))
				{
				$A1_pop1_total{$nsnps}++;
				}

			if (($A1_ind eq $A2{$nsnps}) && ($pop{$i} == 1))
				{
				$A2_pop1_total{$nsnps}++;
				}

			if (($A2_ind eq $A1{$nsnps}) && ($pop{$i} == 1))
				{
				$A1_pop1_total{$nsnps}++;
				}

			if (($A2_ind eq $A2{$nsnps}) && ($pop{$i} == 1))
				{
				$A2_pop1_total{$nsnps}++;
				}

			if (($A1_ind eq $A1{$nsnps}) && ($pop{$i} == 2))
				{
				$A1_pop2_total{$nsnps}++;
				}

			if (($A1_ind eq $A2{$nsnps}) && ($pop{$i} == 2))
				{
				$A2_pop2_total{$nsnps}++;
				}

			if (($A2_ind eq $A1{$nsnps}) && ($pop{$i} == 2))
				{
				$A1_pop2_total{$nsnps}++;
				}

			if (($A2_ind eq $A2{$nsnps}) && ($pop{$i} == 2))
				{
				$A2_pop2_total{$nsnps}++;
				}

			}

		undef @splitPEDline;

		}
close READ3;

#####################################################################

########################## print output file #########################

print OUT1 "[loci]=$nsnps\n\n";
print OUT1 "[populations]=2\n\n";
print OUT1 "[pop]=1\n";

foreach $key (sort {$a <=> $b} keys %rsid) {
	$totalgenes = $A1_pop1_total{$key}+$A2_pop1_total{$key};
	print OUT1 "$key	$totalgenes	2	$A1_pop1_total{$key}	$A2_pop1_total{$key}\n";
	print OUT2 "$key	$rsid{$key}\n";
}

print OUT1 "\n";
print OUT1 "[pop]=2\n";

foreach $key (sort {$a <=> $b} keys %rsid) {
	$totalgenes = $A1_pop2_total{$key}+$A2_pop2_total{$key};
	print OUT1 "$key	$totalgenes	2	$A1_pop2_total{$key}	$A2_pop2_total{$key}\n";
}

######################################################################

