#!/usr/bin/env perl

use strict;
use warnings;
use autodie;
use File::Basename;
use Data::Dumper;

use constant DIR => 'analysis';

if (@ARGV < 2) {
	print "Usage: $0 <VCF> <DIR> [<BLACKLIST>]\n";
	print "DIR: directory with sandy files\n";
	print "BLACKLIST: file with retrocopies\n";
	exit 0;
}

my ($vcf_file, $sandy_dir, $black_file) = @ARGV;

my ($rtcs_a, $genotype_h) = index_vcf($vcf_file);
#print Dumper($rtcs_a);
#print Dumper($genotype_h);
my @black_list;
my %b;

if (defined $black_file) {
	open my $fh, "<", $black_file;
	%b = map { chomp; $_ => 1 } <$fh>;
	for my $rtc (@$rtcs_a) {
		push @black_list, 0;
		for (keys %b) {
			if (/$rtc->{pg}/) {
				$black_list[-1] = 1;
				last;
			}
		}
	}
	close $fh;
}

my @sandy_files = glob "$sandy_dir/*.sandy";
#print "@sandy_files\n";

mkdir DIR unless -d DIR;

for my $sandy_file (@sandy_files) {
	my $sandy = basename $sandy_file, '.sandy';
	my $genotype = $genotype_h->{$sandy};
	if (defined $genotype) {
		compare($rtcs_a, $genotype, $sandy_file, $sandy, \@black_list);
	} else {
		warn "$sandy not found at \$genotype_h hash";
	}
}

sub compare {
	my ($rtc_a, $genotype_a, $sandy_file, $sandy, $black_list_a) = @_;

	my $sandy_rtc_h = index_sandy($sandy_file);
	open my $fh, ">" => DIR . "/$sandy.tsv";

	for my $i (0..$#$rtc_a) {
		next if $genotype_a->[$i] eq 'HOR';
#		next if $black_list_a->[$i];

		my $rtc_h = $rtc_a->[$i];
		my $srtc_h;

		my $pg = '';
		for (split /\//, $rtc_h->{pg}) {
			if ($sandy_rtc_h->{$_}) {
				$srtc_h = delete $sandy_rtc_h->{$_};
				$pg = $_;
				last;
			}
		}

		next if $b{$pg};

		if (defined $srtc_h) {
			print $fh "$srtc_h->{chr}\t$srtc_h->{pos}\t$pg\t$srtc_h->{strand}\t$srtc_h->{genotype}";
		} else {
			print $fh "-\t-\t-\t-\t-";
		}

		print $fh "\t$rtc_h->{chr}\t$rtc_h->{pos}\t$rtc_h->{pg}\t$rtc_h->{strand}\t$genotype_a->[$i]\n";
	}

	if (%$sandy_rtc_h) {
		for my $pg (keys %$sandy_rtc_h) {
			next if $b{$pg};
			my $srtc_h = $sandy_rtc_h->{$pg};
			print $fh "$srtc_h->{chr}\t$srtc_h->{pos}\t$pg\t$srtc_h->{strand}\t$srtc_h->{genotype}";
			print $fh "\t-\t-\t-\t-\t-\n";
		}
	}

	close $fh;
}

sub index_sandy {
	my $sandy_file = shift;
	open my $fh, "<" => $sandy_file;

	my %rtc;

	while (<$fh>) {
		chomp;
		next if /^#/;

		my @f = split /\t/;
		my ($pg, $strand, $genotype) = split /_/, $f[2];

		$rtc{$pg} = {
			chr       => $f[0],
			pos       => $f[1],
			strand    => $strand,
			genotype  => $genotype
		};
	}

	close $fh;
	return \%rtc;
}

sub index_vcf {
	my $file = shift;
	open my $fh, "<" => $file;

	my (@rtcs, %genotype, @genotype_idx);

	while (<$fh>) {
		chomp;
		next if /^##/;

		my @f = split /\t/;

		my @genotypes = splice @f, 9;

		if (/^#/) {
			@genotype_idx = @genotypes;
			next;
		}

		for my $i (0..$#genotypes) {
			my $genotype = $genotypes[$i];
			my $haplo = 'HOR';
			if ($genotype =~ /(0\/1|1\/0)/) {
				$haplo = 'HET';
			} elsif ($genotype =~ /1\/1/) {
				$haplo = 'HOA';
			}
			push @{ $genotype{$genotype_idx[$i]} } => $haplo;
		}

		my %rtc = (
			chr       => $f[0],
			pos       => $f[1],
			imprecise => 0,
			strand    => '.'
		);

		my @infos = split /;/, $f[7];

		for my $info (@infos) {
			my ($key, $value) = split /=/, $info;
			if ($key eq 'PG') {
				$rtc{pg} = $value;
			} elsif ($key eq 'POLARITY') {
				$rtc{strand} = $value eq '+' ? 'p' : 'm';
			} elsif ($key eq 'IMPRECISE') {
				$rtc{imprecise} = 1;
			}
		}

		push @rtcs => \%rtc;
	}

	close $fh;
	return (\@rtcs, \%genotype);
}
