#!/bin/perl use strict; use vars qw($USAGE); # random sequence generator # # -c=1 option will cause prot sequences to be built # using vertebrate aa frequencies, # with option -a putting a 1st methionine residues on. Frequencies are # calculated from the NCBI human RefSeq protein sequences # -c and -a only affect protein sequences # -a only works in conjunction with -c # -n number of random sequences, default = 1 use Bio::PrimarySeq; use Bio::SeqIO; use Getopt::Long; my ($length,$type,$filename,$comp,$met); $USAGE = 'usage: generate_random_seq.pl --length=1000 --type=dna --filename=/tmp/test.seq --number=50'; my %alphabets = ( 'dna' => [qw(C A G T)], 'rna' => [qw(C A G U)], 'prot'=> [qw( A C D E F G H I K L M N P Q R S T V W Y)], ); # make random num from 1-10000. numbers in this array reflect the frequency, # e.g., a random number from 1.744 = A, 745-991 = C etc; my @aa_frequencies = qw(744 991 1398 2017 2378 3104 3349 3726 4239 5273 5443 5749 6410 6848 7455 8263 8760 9340 9488 9713 10000); my $number = 1; &GetOptions ( 'l|length:s' => \$length, 't|type|m|alphabet:s' => \$type, 'f|file|filename:s' => \$filename, 'c|composition:s' => \$comp, 'a|methionine:s' => \$met, 'n|number:s' => \$number ); assert ( $type && defined ($alphabets{lc $type}), $USAGE); assert ( $length && $length =~ /^\d+$/, $USAGE ); foreach my $num (1..$number) { my $sequence = ""; my $alphabet = $alphabets{lc $type}; my $sspace = scalar @$alphabet; if (!$comp || $type ne 'prot') { foreach ( 1..$length ) { $sequence .= $alphabet->[ int rand($sspace) ]; } }elsif ($type eq 'prot') { $sequence = build_seq($length, \@aa_frequencies); } my $seq = Bio::PrimarySeq->new(-seq => $sequence, -display_id => 'randomseq'.$num); my %args = (-format => 'fasta'); if( $filename ) { $args{-file} = ">>$filename" } my $seqio = Bio::SeqIO->new(%args); $seqio->write_seq($seq); } sub assert { die $_[1] unless( $_[0] ); } sub build_seq { #takes seqlen and ref to frequency data as parameters my ($len, $pf) = @_; my $str = ($met)?'M':''; my $i = ($met)?1:0; for ($i..$len-1) { my $aa = int(rand (10000)) ; my $j = 0; while ($pf->[$j] < $aa && $j <19) { $j++; } $str .= $alphabets{'prot'}[$j]; } print "str is $str\n"; return $str; }