#!/usr/bin/perl ##################################################################### # # # script expects multiple fasta file of mel,sim,yak # # all files have SOME data for all 3 seqs # # mel seq begins >mel_ & yak seq begins >yak_ # # there may be multiple sim sequences, they # # will be separated and an average dN & dS will # # be calculated based on coverage of each line # # # ##################################################################### #use strict; use warnings; use Bio::AlignIO; use Bio::TreeIO; use File::Spec; ######################### MAIN - I.E. FUNCTION CALLS ###################### ##OPEN DIRECTORY WITH FASTA FILES #opendir(DIR, './MSY_IG/IG_fastas/'); opendir(DIR, './'); my @allfiles = readdir DIR; my $numfiles = scalar @allfiles; my $outfile = "IG_basemlDIV.V7.50ntHKY.txt"; open (OUT, ">$outfile") or die "Can't open $outfile: $!\n"; runPAML(); close OUT; exit; ######################################################################### #SUBROUTINES# ######################################################################### #~~~~~~~~~~~~~~~~~~~~~PAML WRAPPER~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sub runPAML{ my $a=0; while ($a<$numfiles){ if($allfiles[$a] =~ /.fa$/){ ##these are multiple fasta files with mel, yak, and several sim seqs #my $datafile = './MSY_IG/IG_fastas/'.$allfiles[$a]; my $datafile = './'.$allfiles[$a]; open (DATAINPUT, $datafile) || die "Can't open $datafile: $!\n"; #READ IN DATA my $data; my $melSEQ; my $melDATA; my $yakSEQ=0; my @simID; my @simSEQ; # make the $/ setting local { local $/ = ">"; ; while($data = ){ if ($data =~ /([+A-Za-z0-9]+)(.*)/) { my ($tempID, $tempCG) = ($1,$2); $data =~ s/.*?\n//; # remove defline $data =~ s/\s+//g; # remove whitespace $data =~ s/>//g; # remove > my $tempSEQ = $data; if($tempID eq 'mel'){ $melSEQ = $tempSEQ; $melDATA = $tempCG; } elsif($tempID eq 'yak'){ $yakSEQ = $tempSEQ; } else{ push(@simID, $tempID); push(@simSEQ, $tempSEQ); } } } close DATAINPUT; } if(length $yakSEQ > 49){ ###CALCULATE THE NUMBER OF COVERED SITES my $len = length $melSEQ; my @simCOVER=(0,0,0,0,0,0); my $coverage=0; my $numsim = scalar @simSEQ; my $enoughseq=0; for(my $i=0;$i<$len;$i++){ my $flag=0; my $sites=0; my $yaksite = substr($yakSEQ, $i, 1); if($yaksite ne 'N'){ for(my $j=0;$j<$numsim;$j++){ if((substr($simSEQ[$j], $i, 1)) ne 'N'){ $simCOVER[$j]++; $flag=1; } } } if($flag){ $coverage++; ## there's yak and at least one sim at the site } } for(my $k=0;$k<$numsim;$k++){ if($simCOVER[$k] > 49){ $enoughseq=1; } } if($enoughseq){ my @Sites; my @melDIV; my @simDIV; my @yakDIV; #PARSE TO SEPARATE TEMP FILES - I'm separating out the sim seqs into individual files here ##so that I can get average divergence over all sim seqs; files are PHYLIP format ## I've hard coded in here that there have to be at least 50 sites before I'll call baseml ## and calculate the divergence for(my $i=0;$i<$numsim;$i++){ ### CHECK TO MAKE SURE THERE'S SOME DATA FOR THE PARTICULAR SIM LINE if($simCOVER[$i]>49){ open (SIM, ">./file.phy"); print SIM " 3 $len\n"; print SIM "mel\n$melSEQ\n"; print SIM "$simID[$i]\n$simSEQ[$i]\n"; print SIM "yak\n$yakSEQ\n"; close SIM; system (baseml); #use the file to run baseml - have a set baseml.ctl that looks for file.phy and msy.tre #extract divergence and number of sites from mlb open(MLB, mlb) || die; ; while($data = ){ if ($data =~ /ns\s=\s3\s+\tls\s=\s(\d+)/) { push(@Sites, $1); #print OUT "$1\t"; } elsif($data =~ /\(\S+\:\s([.0-9]+)\,\s\S+\:\s([.0-9]+)\,\s\S+\:\s([.0-9]+)\)\;/){ push(@melDIV, $1); push(@simDIV, $2); push(@yakDIV, $3); #print OUT "$1\t$2\t$3\n"; } } close MLB; } } #AFTER LOOPING THROUGH ALL FILES #CALC TOTAL NUMBER OF SITES my $totalsites; foreach my $sites(@Sites){ $totalsites += $sites; } my @prop; my $seqs = scalar @Sites; for(my $y=0;$y<$seqs;$y++){ $prop[$y] = $Sites[$y]/$totalsites; } #change this so that it doesn't print out 0 for no data #CALC AVERAGE - MUTLIPLY EACH divergence by the PROPORTION OF SITES my $avgmd=0; my $avgsd=0; my $avgyd=0; for(my $z=0;$z<$seqs;$z++){ $avgmd += $prop[$z]*$melDIV[$z]; $avgsd += $prop[$z]*$simDIV[$z]; $avgyd += $prop[$z]*$yakDIV[$z]; } #print "$allfiles[$a]\t$coverage\t$avgmd\t$avgsd\t$avgyd\n"; print OUT "$allfiles[$a]\t$coverage\t$avgmd\t$avgsd\t$avgyd\n"; } } } $a++; } } ############################## END OF FILE #######################################