#! /usr/bin/perl ##################################################### # Fasta # | # Align # # By # Antony Vincent # (a.vincent.0312@gmail.com) # # FastAlign is a perl script which uses the heuristic method # of tfasty36 for find similarity between a query sequence # in amino acids and a sequence in nucleotides. It provides # a more intuitive output to find exon-intron junctions. # The query string is in amino acids and the hit string is # in nucleotides. There are extra nucleotides at the end of # the hit string (option -diff and by default = 10), that # allow to verify if the intron start with common rules # (5'-GTGCGA-... for group II intron and after an exonic T # for group I intron). # # The FASTA version can be changed by the user by changing # the line with tfasty36 for tfastyXX. # # If you have Emboss, you can genarate a graphic with option # -graph 1. # # For complete help: type perl fastalign.pl -help # Last Update: 01/06/13 ####################################################### =head1 Copyright (C) 2013 Antony Vincent Licence: This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. =cut use strict; use Bio::SearchIO; use Bio::SeqIO; use Getopt::Long; use Bio::SeqUtils; ## Set the default variables my $identity = 75; my $length = 50; my $diff = 10; my $out = 'output'; my $graphic = 10; my $query; my $library; my $help; GetOptions( 'seq=s' => \$query, 'db=s' => \$library, 'graph=s' => \$graphic, 'i=i' => \$identity, 'l=i' => \$length, 'diff=s' => \$diff, 'out=s' => \$out, 'help!' => \$help, ) or die "Incorrect usage! Try perl fastalign.pl -help for an exhaustif help.\n"; ### if( $help ) { # if start print "\n"; print "Two options are required:\n"; print " -seq: Your sequence in amino acids\n"; print " -db: The sequence in nucleotides (Could be a whole genome or a partial sequence...)\n"; print "\n"; print "There are few miscellaneous options:\n"; print " -i: Minimum identity percentage (default = 75)\n"; print " -l: Minimum match length (default = 50)\n"; print " -diff: Difference between the hit string and the alignement (default = 10)\n"; print " -out: Name of the output file (default = output.txt)\n"; print " -graph: If this option = 1, a graph will be generated (default = no)\n"; } # if help else { # else start my $date = `date`; open (WRITE, ">>$out.txt"); ## Open the output file print WRITE " Fasta\n"; print WRITE " |\n"; print WRITE " Align\n\n"; print WRITE "Date:", $date, "\n"; print WRITE "PARAMETERS\n"; print WRITE "Minimum identity =", $identity, "\n"; print WRITE "Minimum length =", $length, "\n"; print WRITE "Diff =", $diff, "\n\n"; if ( $graphic == 1 ) { open (WRITE, ">>$out.txt"); ## Open the output file open (WRITE2, ">>lindna.lnp"); ## Open the lindna config file ## start the lindna header print WRITE2 "start"; print WRITE2 "\t"; print WRITE2 "1"; print WRITE2 "\n"; print WRITE2 "End"; print WRITE2 "\t"; my $seqio_obj = Bio::SeqIO->new(-file => "$library", -format => "fasta" ); my $seq_obj = $seqio_obj->next_seq; my $count_obj = $seq_obj->length; print WRITE2 "$count_obj"; print WRITE2 "\n\n"; print WRITE2 "group"; print WRITE2 "\n"; } else { print "No graphic generated \n"; } ## run tfasty36 my $fh; my $fasta = "tfasty36"; # <------ You can change this line for newest fasta algorithm my $command = "$fasta $query $library"; open $fh,"$command |" || die("cannot run fasta cmd of $command: $!\n"); my $searchio = Bio::SearchIO->new(-format => 'fasta', -fh => $fh); print WRITE "Fasta algorithm:", $fasta, "\n\n"; ## start the parsing part of the script while( my $result = $searchio->next_result ) { ## $result is a Bio::Search::Result::ResultI compliant object while( my $hit = $result->next_hit ) { ## $hit is a Bio::Search::Hit::HitI compliant object while( my $hsp = $hit->next_hsp ) { ## $hsp is a Bio::Search::HSP::HSPI compliant object if( $hsp->length('total') > $length ) { if ( $hsp->percent_identity >= $identity ) { ## Generals informations print WRITE "Rank=", $hsp->rank, "\n", "Query=", $result->query_name, "\n", "Hit=", $hit->name, "\n" , "Length=", $hsp->length('total'), "\n", "Percent_id=", $hsp->percent_identity, "\n", "Strand=", $hsp->strand('hit'), "\n"; print WRITE "\n"; ## Search for nucleotide sequences print WRITE "\n"; my $start_hit = $hsp->start('hit'), "\n"; my $end_hit = $hsp->end('hit') , "\n"; my $in = Bio::SeqIO->new(-file => "$library" , '-format' => 'fasta'); while ( my $seq = $in->next_seq() ) {#1 ## looking for query position my $start_query = $hsp->start('query'), "\n"; my $end_query = $hsp->end('query') , "\n"; ## aa_to_3aa my $seqobj = Bio::PrimarySeq->new ( -seq => $hsp->query_string); my $polypeptide_3char = Bio::SeqUtils->seq3($seqobj); ## modify the homology string my $homo = $hsp->homology_string; $homo =~ s/:/|||/g; $homo =~ s/\./***/g; $homo =~ s/ /XXX/g; ## HSP print WRITE "Query($start_query,$end_query)\n"; print WRITE "Hit($start_hit,$end_hit)\n\n"; print WRITE $polypeptide_3char, "\n"; print WRITE $homo, "\n"; print WRITE $seq->subseq($start_hit,$end_hit+$diff), "\n"; if ( $graphic == 1) { ## if start ## write in lindna file print WRITE2 "label", "\n", "Block", "\t", "$start_hit", "\t", "$end_hit", "\t", "3", "\t", "H", "\n"; print WRITE2 "Exon", $hsp->rank, "\n"; print WRITE2 "endlabel"; print WRITE2 "\n\n"; } ## if end else {print "No lindna file generated\n";} } #1 print WRITE "\n"; } } } } } if ( $graphic == 1) { ## if start print WRITE2 "endgroup"; system ("lindna -infile lindna.lnp -ruler y -blocktype filled -graphout svg"); system ("rm *.lnp"); } ## if end } # else end