#!/usr/bin/env perl
# $Id: extract_mRNA,v 1.4 2007/01/25 14:25:13 c4chris Exp $

################################################################################
#
# extract_mRNA
# ------------
#
# Claudio Lottaz, SIB-ISREC, Claudio.Lottaz@isb-sib.ch
# Christian Iseli, LICR ITO, Christian.Iseli@licr.org
#
# Copyright (c) 2006 Swiss Institute of Bioinformatics. All rights reserved.
#
################################################################################

use strict;
use EMBLFile;
use GBFile;
use Symbol;

# global variables
my $verbose = 1;

my $datadir = '.';
my $filestem = '';

require "build_model_utils.pl";

################################################################################
#
#   Check command-line for switches
#

my $usage = "Usage: extract_mRNA [options] <conf-files>\n" .
    "       where options are:\n" .
    "        -q       don't log on terminal\n" .
    "More information is obtained using 'perldoc extract_mRNA'\n";

while ($ARGV[0] =~ m/^-/) {
    if ($ARGV[0] eq '-q') { shift; $verbose = 0; next; }
    die "Unrecognized switch: $ARGV[0]\n$usage";
}
if ($#ARGV < 0) { die "No configuration file specified\n$usage"; }

################################################################################
#
#   Main-loop through all specified config-files
#

my $parFile;
while($parFile = shift) {
    my($organism, $hightaxo, $dbfiles, $ugdata, $estdata, $datadir2, $filestem2,
       $rnafile, $estfile, $estcdsfile, $estutrfile, $trainingfile, $testfile,
       $utrfile, $cdsfile, $tuplesize, $minmask, $pseudocounts, $minscore,
       $startlength, $startpreroll, $stoplength, $stoppreroll, $smatfile,
       $estscanparams, $nb_isochores, $isochore_borders) =
	   readConfig($parFile, undef, undef, undef,
		      undef, undef, undef,
		      undef, undef, $verbose);
    log_open("readconfig.log");
    $datadir = $datadir2; $filestem = $filestem2;
    showConfig($parFile, $organism, $hightaxo, $dbfiles, $ugdata, $estdata, $datadir,
	       $rnafile, $estfile, $estcdsfile, $estutrfile, $trainingfile,
	       $testfile, $utrfile, $cdsfile, $tuplesize, $minmask,
	       $pseudocounts, $minscore, $startlength, $startpreroll,
	       $stoplength, $stoppreroll, $smatfile, $estscanparams,
	       $nb_isochores, $isochore_borders);
    log_close();

    # extract data
    log_open("extract_data.log");
    extract_mRNAs($dbfiles, $organism, $hightaxo, $rnafile);
    log_close();
    print "$parFile done.\n";
}

exit(0);

################################################################################
#
#   Extract mRNA entries
#

sub extract_mRNAs {
    # Find all full-length messenger RNA entries of the given species
    # ($organism) in the given databse ($dbfiles) and writes them in
    # FASTA format with CDS annotation in the header ($rnafile). Tries
    # to guess whether RefSeq or EMBL data is provided.

    my($dbfiles, $organism, $hightaxo, $rnafile) = @_;

    log_print("\nExtracting mRNA entries...");
    if (-s $rnafile) {
      log_print(" - $rnafile already exists, skipped");
      return;
    }

    my($file, $fileSeqs, $fileLen);
    my $totSeqs = 0;
    my $totLen = 0;
    my $outfh = gensym;
    open($outfh, ">$rnafile");
    my(@infiles) = glob($dbfiles);
    foreach $file (@infiles) {
	$fileSeqs = 0;
	$fileLen = 0;
	$_ = `head -1 $file`;
	if (m/^LOCUS/) { # we read a file in genbank format
	    my $src = GBFile->new("$file");
	    $src->openStream;
	    log_print(" - processing RefSeq-file $file...");
	    my $e;
	    while(defined ($e = $src->getNext)) {
		my $ac = $ {$e->{_GBac}}[0];
		next if $e->{_GBtype} ne "mRNA";
		next unless $e->{_GBcdsst} =~ /^\d+$/;
		if ($hightaxo eq "") {
		  if (($e->{_GBscn} eq $organism)) {
		    $fileSeqs++;
		    $fileLen += $e->{_GBcdsen} - $e->{_GBcdsst} + 1;
		    print $outfh &genRNAEntry($ac, $e->{_GBdesc},
					      $e->{_GBcdsst}, $e->{_GBcdsen},
					      $e->{_seq});
		  }
		} else {
		  my $o = index($e->{_GBtaxo}, $hightaxo);
		  if ($o >= 0) {
		    $fileSeqs++;
		    $fileLen += $e->{_GBcdsen} - $e->{_GBcdsst} + 1;
		    print $outfh &genRNAEntry($ac, $e->{_GBdesc},
					      $e->{_GBcdsst}, $e->{_GBcdsen},
					      $e->{_seq});
		  } 
		}
	    }
	    close($src->{_BTFfile});
	} else { # we read a file in EMBL format
	    my $src = EMBLFile->new("$file");
	    $src->openStream;
	    log_print(" - processing EMBL-file $file...");
	    my $e;
	    while(defined ($e = $src->getNext)) {
		if ($hightaxo ne "") {
		  my $o = index($e->{_EMBLtaxo}, $hightaxo);
		  if (($o < 0)
		      || ($e->{_EMBLtype} ne "mRNA")) {
		    next; # no mRNA entry
		  }
		} elsif (($e->{_EMBLscn}[0] ne $organism)
			 || ($e->{_EMBLtype} ne "mRNA")) {
		  next; # no mRNA entry
		}
		my($begin, $end);
		foreach(@{$e->{_EMBLft}}) {
		  if (($begin, $end) = $_ =~ /^CDS\s+(<?\d+)\.\.>?(\d+)$/) {
		    last;
		  }
		}
		if (!defined($begin)) {
		  next; # no CDS found
		}
		if ((m/>/) && (m/</)) {
		  next; # frame is undefined
		}
		# all checks passed
		$fileSeqs++;
		$fileLen += $end - $begin + 1;
		print $outfh &genRNAEntry($ {$e->{_EMBLac}}[0],
					  $e->{_EMBLdesc},
					  $begin, $end, $e->{_seq});
	    }
	    close($src->{_BTFfile});
	}
	$totSeqs += $fileSeqs;
	$totLen += $fileLen;
	log_print("   found $fileSeqs sequences, $fileLen coding nucleotides");
    }
    close($outfh);
    log_print(" - overall found $totSeqs sequences, "
	      . "$totLen coding nucleotides");
}

sub genRNAEntry {
    my($id, $desc, $cdsBegin, $cdsEnd, $seq) = @_;
    $seq =~ s/(.{80})/$1\n/g;
    return ">tem|$id CDS: $cdsBegin $cdsEnd $desc\n$seq\n";
}

################################################################################
#
#   Documentation
#

=head1 NAME

extract_mRNA - extract mRNA data to build models for ESTScan

=head1 SYNOPSIS

extract_mRNA [options] <config-files...>

=head1 DESCRIPTION

mRNA data is extracted from files in EMBL or RefSeq format. The script reads
configuration files on the command line and performs the extraction.

=head1 DIRECTORY STRUCTURE

build_model uses the directory structure which is given in the configuration
file.

mRNA data, test and training data files, is deposited in the data-root
directory if not otherwise specified in the configuration file.

=head1 OPTIONS

 -q quiet
    Do not log on terminal.

=head1 CONFIGURATION FILE

The parameters defined in the configuration file have the
following meaning:

 * $organism (mandatory)
    The desired organism as it is given in EMBL "OS" or RefSeq
   "ORGANISM" lines.

 * $dbfiles
   Files from where full-length mRNA sequences are to be extracted,
   tries to guess whether the files come from EMBL or RefSeq. If
   this is not specified, expects a collection of mRNA in $rnafile.

 * $datadir (mandatory)
   Base directory where all of the above files are located
   and the temporary result files are stored

 * $rnafile (default is "$datadir/mrna.seq")
   Name of the file with the extracted mRNA entries.

 * $smatfile (default is "$datadir/Matrices/$filestem.smat")
   Name of the file where the HMM-model is to be written

 * $nb_isochores (default is 0)
   Number of isochores, when isochores are to be determined
   automatically from the GC-content distributaion as equal-sized
   groups. 0 means no automatic detection.

 * @isochore_borders (default is (0, 43, 47, 51, 100)
   Array of GC percentages where isochores are split, first entry is
   usually 0 and last 100. This is overwritten when $nb_isochores is
   not zero.

 * $tuplesize (default is 6)
   Size of tuples counted for codon statistics. This is overwritten by
   the -t switch.

 * $minmask (default is 30)
   Minimal run of consecutive nucleotides masked as redundant. This is
   overwritten by the -m switch.

 * $pseudocounts (default is 1)
   pseudocount to be added when generating the codon usage tables,
   overwritten by -m

 * $minscore (default is -100)
   minimum score attributed to log-odds and log-probabilities,
   overwritten by -s.

 * $startlength, $startpreroll (default 2+ceil(tuplesize/3) and 2)
   number of nucleotide triplets contained in the start profile and
   how many of these are contained in the 5' untranslated region,
   overwritten by -l (length) and -r (preroll)

 * $stoplength, $stoppreroll (default 2+ceil(tuplesize/3) and 2)
   number of nucleotide triplets contained in the stop profile and
   how many of these are contained in the coding sequence,
   overwritten by -L (length) and -R (preroll)

 * $estscanparams (default is "-m -50 -d -50 -i -50 -N 0")
   parameters passed to ESTScan during evaluation

$filestem is used to generate many filenames. It is generated
automatically according to the tuplesize, the minmask and the
pseudocounts applied to generate them.

=head1 AUTHOR

Claudio Lottaz, SIB-ISREC, Claudio.Lottaz@isb-sib.ch

=cut

#
#   End of file
#
################################################################################
