#!/usr/bin/perl
# Faster BLAST searching: the input FASTA is split into parts and several
# searches are run at the same time, depending on the number of available threads.

use strict;
use warnings;
use Bio::Tools::Run::myStandAloneBlastPlus;
use Bio::SeqIO;
use Bio::DB::Fasta;
use Getopt::Long;
use threads;
use POSIX qw(ceil floor);


# Default BLAST method is blastn (nucleotide query vs. nucleotide DB).
# If blastx or blastp is chosen instead, a protein DB must be provided.
my $blast_method='blastn';
# Default e-value cutoff handed to every BLAST search.
my $eval=0.01;

# Other option variables. They are file-scoped because the subs at the end
# of the file read @blastdb, $num_threads and $wsize directly.
my ($opts,$inseq,@blastdb,@species,$num_threads,$num_avail_threads,$outf,$wsize);

$opts=GetOptions("in=s"=>\$inseq,               # input FASTA file
				 "db=s{1,}"=>\@blastdb,         # one or more BLAST databases
				 "sp=s{1,}"=>\@species,         # species label(s) used in the header line
				 "out=s"=>\$outf,               # master result file
				 "bm=s"=>\$blast_method,        # blastn (nucl vs. nucl) or blastx (nucl vs. protein)
				 "nat=i"=>\$num_avail_threads,  # total number of threads available
				 "nt=i"=>\$num_threads,         # num of threads per search
				 "e=f"=>\$eval,                 # evalue
				 "w=i"=>\$wsize                 # word size (overrides the per-method default)
				 );

# GetOptions returns false when the command line could not be parsed;
# stop instead of running with half-initialised settings.
die "usage: $0 -in <fasta> -db <db> [<db> ...] -sp <name> [<name> ...] -out <file>"
	." [-bm <method>] [-nat <int>] [-nt <int>] [-e <float>] [-w <int>]\n"
	unless $opts;

# The input and output files are used unconditionally further down.
die "missing input FASTA file (-in)\n" unless defined $inseq;
die "missing output file (-out)\n" unless defined $outf;

# Without these defaults the thread arithmetic further down divides by an
# undefined value. One thread per search and a single search at a time is
# the conservative fallback.
$num_threads=1 unless defined $num_threads;
$num_avail_threads=$num_threads unless defined $num_avail_threads;
die "-nt must be a positive integer\n" if $num_threads < 1;
die "-nat must be a positive integer\n" if $num_avail_threads < 1;

# The header gets one column group per -sp value while every result row gets
# one column group per -db value, so a mismatch misaligns the report.
warn "no BLAST database given (-db); result rows will only hold contig names\n"
	unless @blastdb;
warn "number of -sp values (".scalar(@species).") differs from number of -db values ("
	.scalar(@blastdb)."); header and result columns will not line up\n"
	if @blastdb && scalar(@species) != scalar(@blastdb);

				 
my ($num_seqs,$num_seqs_per_thread,$num_threads_to_exec);

# Count the FASTA records (lines starting with '>') directly in Perl.
# This avoids handing the file name to a shell and does not depend on a
# grep that supports -P.
die "no input FASTA file given (-in)\n" unless defined $inseq;
$num_seqs=0;
{
	open(my $fh_count,'<',$inseq) or die "problems $inseq:$!\n";
	while(my $line=<$fh_count>){
		$num_seqs++ if $line=~/^>/;
	}
	close $fh_count;
}

# Number of searches run side by side = available threads / threads per search.
# Missing or non-positive values fall back to 1 thread per search and one
# search at a time. The ratio is rounded down and kept at 1 or more, so the
# number of parts requested from split_fasta.pl is always a whole number.
$num_threads=1 unless defined $num_threads && $num_threads > 0;
$num_avail_threads=$num_threads unless defined $num_avail_threads && $num_avail_threads > 0;
$num_threads_to_exec=floor($num_avail_threads/$num_threads);
$num_threads_to_exec=1 if $num_threads_to_exec < 1;
$num_seqs_per_thread=ceil($num_seqs/$num_threads_to_exec);
print "NSEQS: $num_seqs :\n";
print "NTHREADS: $num_threads_to_exec :\n";
print "NSEQ_PER_THREAD: $num_seqs_per_thread :\n";

# Output of split_fasta.pl is captured here (as before) but not inspected.
my $res;
print "splittin the fasta for faster blasting, hopefully\n";
{
	# List-form pipe open: the arguments reach split_fasta.pl verbatim, no shell
	# is involved, and a failing splitter stops the run here.
	open(my $fh_split,'-|','split_fasta.pl','-i',$inseq,'-p',$num_threads_to_exec)
		or die "cannot run split_fasta.pl:$!\n";
	$res=do { local $/; <$fh_split> };
	close $fh_split
		or die "split_fasta.pl failed (exit status ".($? >> 8).")\n";
}

# Write the header line of the MASTER BLAST RESULTS FILE.
# Layout: one CONTIG column followed by one group of ten columns per species;
# the last column of each group is named after the species itself.
# The whole header is a single line terminated by exactly one newline, no
# matter how many species were given, so it lines up with the single-line
# result rows that are appended to this file later.
{
	my @header=('CONTIG');
	for my $sp (@species){
		push @header,'LENGTH_Q','LENGTH_S','LENGTH_HIT','E-Val','FRAC_IDENT',
			'Q_COV','S_COV','CHECK','ACCESSION',$sp;
	}

	open(my $fho,'>',$outf) or die "problems $outf:$!\n";
	print $fho join("\t",@header),"\n";
	# buffered write errors only surface when the handle is closed
	close $fho or die "problems $outf:$!\n";
}

# number of BLAST databases; read by worker()
my $szbdb=scalar(@blastdb);

# store the threads
my @threads;

# things to clean
my @to_clean;

# Base name used to derive the sub-fasta file names (<base>_p<N>.fasta).
# The capture is only used when the match succeeded; otherwise $1 would be
# undefined or left over from an earlier match.
my $fa_out_base;
if($inseq=~/(.*?)\.fasta/){
	$fa_out_base=$1;
}else{
	die "cannot derive a base name from '$inseq': file name must contain '.fasta'\n";
}

# per-thread BLAST result files, in thread order
my @bl_outs;

# int() gives the same iteration count the original '<=' loop had for a
# fractional value
for my $i (1..int($num_threads_to_exec)){
	# sub-fasta input file for each thread
	my $fa_in=$fa_out_base.'_p'.$i.'.fasta';
	# blast result from each thread
	my $bl_out='blast_res_p'.$i.'.txt';
	# tempdir for each thread
	my $tempdir='Tempdir_p'.$i;
	# temp.fas for each thread
	my $tempfas='Temp_fas_p'.$i;

	# create a temp dir in which the FAS and BLAST report are stored;
	# an already existing directory is reused
	unless(-d $tempdir){
		mkdir $tempdir or die "cannot create $tempdir:$!\n";
	}

	push @to_clean,$tempdir,$bl_out,$fa_in;
	push @bl_outs,$bl_out;

	# create the stream for each thread
	my $seqstr=Bio::SeqIO->new(-file=>"<$fa_in",-format=>"fasta");
	# start the thread
	my $thr=threads->create({'void'=>1},\&worker,$seqstr,$bl_out,$i,$tempdir,$tempfas);
	push @threads,$thr;
}

# wait until every search has finished
for my $trd(@threads){
	$trd->join();
}

# Append the per-thread reports, in thread order, to the master result file.
# Done in Perl so file names never pass through a shell. A missing part is
# reported and skipped so the remaining results are still collected.
{
	open(my $fh_master,'>>',$outf) or die "problems $outf:$!\n";
	for my $bl_out(@bl_outs){
		my $fh_part;
		unless(open($fh_part,'<',$bl_out)){
			warn "problems $bl_out:$!\n";
			next;
		}
		while(my $line=<$fh_part>){
			print $fh_master $line;
		}
		close $fh_part;
	}
	close $fh_master or die "problems $outf:$!\n";
}

# cleaning temp dirs and files
cleanup_temp_files(\@to_clean);

###################
###### SUBS #######
###################	

# Remove the temporary files and directories listed in the given array.
#
#   $arr - reference to an array of paths
#
# Plain files are unlinked and directories are removed recursively; entries
# that are undefined or do not exist are skipped. Deletion happens inside
# Perl rather than through `rm` in a shell, so paths containing spaces or
# shell metacharacters are handled correctly. A file that cannot be removed
# is reported with a warning and the clean-up continues with the next entry.
# Returns nothing.
sub cleanup_temp_files{
	my $arr=shift;
	# core module, loaded here so this sub does not depend on the file's use lines
	require File::Path;
	for my $path(@$arr){
		next unless defined $path;
		if(-d $path){
			File::Path::remove_tree($path);
		}elsif(-f $path){
			unlink $path or warn "could not remove $path:$!\n";
		}
	}
	return;
}

# Thread body: BLAST every sequence of one sub-fasta stream against all
# databases and write one tab-separated row per sequence.
#
#   $in_stream - sequence stream; next_seq is called on it until it is exhausted
#   $bl_out    - file the rows of this thread are written to (overwritten)
#   $num       - thread number, only used in the progress messages
#   $tempdir   - per-thread temp directory, handed on to do_blast
#   $tempfas   - per-thread temp fasta name, handed on to do_blast
#
# Row layout: the sequence's display id followed by whatever do_blast returns
# for each database, in the order of @blastdb. Fields are separated by a
# tab and the row ends with a newline, with no trailing tab, matching the
# header line of the master file.
# Dies if $bl_out cannot be opened or closed.
sub worker{
	my ($in_stream,$bl_out,$num,$tempdir,$tempfas)=@_;

	# create a factory for each BLAST DB provided
	my @factories=map { create_FAC($_) } @blastdb;

	# one report file name per factory; do_blast places it inside $tempdir
	my @tmpouts=map { 'tmpout_'.$_.'.txt' } 0..$#factories;

	print " Thread: $num : started!\n";
	open(my $fhb,'>',$bl_out) or die "problems $bl_out:$!\n";

	# do the actual blasting
	while(my $seq=$in_stream->next_seq){
		my @info=($seq->display_id);
		for my $fac(0..$#factories){
			push @info,do_blast($factories[$fac],$seq,$tmpouts[$fac],$tempdir,$tempfas);
		}
		# the newline is kept outside the join, otherwise every row would end
		# in a stray tab
		print $fhb join("\t",@info),"\n";
	}

	# buffered write errors only surface when the handle is closed
	close $fhb or die "problems $bl_out:$!\n";
	print " Thread: $num :finished!\n";
}

# Open a sequence input stream in Fasta format.
#
#   $file - value handed unchanged to the -file argument of Bio::SeqIO->new
#
# Returns the Bio::SeqIO object.
sub set_seqstream{
	my ($file)=@_;
	my $stream=Bio::SeqIO->new(
		-format => 'Fasta',
		-file   => $file,
	);
	return $stream;
}

# Build a BLAST factory bound to one database.
#
#   $db - database name, passed as -db_name
#
# A second argument (a temp dir) is accepted for the callers' convenience but
# is not used here.
# NOTE(review): the file loads Bio::Tools::Run::myStandAloneBlastPlus while the
# object is created from Bio::Tools::Run::StandAloneBlastPlus; presumably the
# loaded module provides that package. Confirm.
# Returns the factory object.
sub create_FAC{
	my $db=$_[0];
	my $blast_factory=Bio::Tools::Run::StandAloneBlastPlus->new(-db_name=>$db);
	return $blast_factory;
}

# BLAST one sequence against one database and summarise the best hit.
#
#   $factory - BLAST factory created by create_FAC
#   $seq     - query sequence object (display_id and length are read)
#   $tmpout  - name of the report file, created inside $tempdir
#   $tempdir - per-thread temp directory
#   $tempfas - per-thread temp fasta name
#
# The search is run through the factory method named by $blast_method, using
# the file-level settings $eval, $wsize and $num_threads.
#
# Returns a list of ten fields. With at least one hit these are the fields
# collected by get_hits for the first hit of the report. Without hits the
# list is the query length, eight '---' placeholders and 'no_hits'.
# The factory's cleanup method is called before returning in both cases.
sub do_blast{
	my ($factory,$seq,$tmpout,$tempdir,$tempfas)=@_;

	# Word size: an explicit -w wins, otherwise 11 for blastn and 3 for
	# blastx/blastp. For any other method it stays undefined and the option
	# is left out (see below) instead of being passed as an undefined value.
	my $word_s;
	if($wsize){
		$word_s=$wsize;
	}elsif($blast_method eq 'blastn'){
		$word_s=11;
	}elsif($blast_method eq 'blastx' || $blast_method eq 'blastp'){
		$word_s=3;
	}

	# Only options that actually have a value are handed to BLAST.
	my @method_args=('evalue'=>$eval);
	push @method_args,'word_size'=>$word_s if defined $word_s;
	push @method_args,'soft_masking'=>'TRUE';
	push @method_args,'num_threads'=>$num_threads if defined $num_threads;
	push @method_args,'max_target_seqs'=>5;

	# blasting and parsing the result here
	my $report=$factory->$blast_method(-query=>$seq,
							 -outfile=>"$tempdir/$tmpout",
							 -task=>$blast_method,
							 -method_args=>\@method_args,
							 -tempdir=>$tempdir,
							 -tempfas=>$tempfas);

	my @ret_hits;
	if($report->num_hits < 1){
		# placeholder row: query length, eight empty fields, marker
		@ret_hits=($seq->length,('---') x 8,'no_hits');
	}else{
		# only the first hit of the report is evaluated
		get_hits($report->next_hit,\@ret_hits,$seq->length);
	}

	$factory->cleanup;
	return @ret_hits;
}


# Collect the report fields for one BLAST hit.
#
#   $hit       - hit object; name, significance, description, frac_identical,
#                frac_aligned_query, frac_aligned_hit, length and length_aln
#                are called on it
#   $ret_hits  - array reference the ten result fields are appended to
#   $query_len - length of the query sequence
#
# Appended fields, in this order: query length, subject length, alignment
# length, significance, fraction identical, query coverage, subject coverage,
# check, accession, description.
#
# Accession: for a name containing '|' everything after the first '|' up to
# the next whitespace is kept and one trailing '|' is removed. A name
# without '|' is kept unchanged.
#
# Check: 'PASSED' when the coverage of the shorter of the two sequences is at
# least 0.5 and the fraction identical is above 0.6, otherwise 'FAILED'.
# Subject coverage is used when the query is longer than the subject, query
# coverage otherwise (including equal lengths, which previously could never
# pass).
sub get_hits{
	my($hit,$ret_hits,$query_len)=@_;

	my $hitname=$hit->name;
	# only trust $1 after a successful match; otherwise it is undefined or
	# left over from an unrelated earlier match
	if($hitname=~/\|(\S+)/){
		$hitname=$1;
		$hitname=~s/\|$//;
	}

	my $eval=$hit->significance;
	my $desc=$hit->description;
	my $frac_iden=$hit->frac_identical;
	# query coverage
	my $qcov=$hit->frac_aligned_query;
	# subject coverage
	my $scov=$hit->frac_aligned_hit;
	# length of the subject
	my $hit_len=$hit->length;
	# length of the alignment
	my $aln_len=$hit->length_aln;

	# coverage of the shorter sequence decides
	my $cov=($query_len > $hit_len) ? $scov : $qcov;
	my $check=(($cov >= 0.5) && ($frac_iden > 0.6)) ? 'PASSED' : 'FAILED';

	push @{$ret_hits},$query_len,$hit_len,$aln_len,$eval,$frac_iden,
		$qcov,$scov,$check,$hitname,$desc;
	return;
}