Sophie: perl-Mail-SpamAssassin-Plugin-WebRedirect-0-6.mga3 src

perl-Mail-SpamAssassin-Plugin-WebRedirect-0-6.mga3.src.rpm

=head1 NAME

Mail::SpamAssassin::Plugin::WebRedirect		version: 20060211

=head1 SYNOPSIS

 loadplugin Mail::SpamAssassin::Plugin::WebRedirect [/path/to/WebRedirect.pm]

 web_redirect_timeout		3

 web_redirect_max_checks	3

 web_redirect_max_size		50000

 web_redirect_host		geocities.com *.geocities.com

 web_redirect_skip_host		example.com *.example.com

 header     WEB_403   eval:WebRedirect_Status(403)
 score      WEB_403   4.0
 describe   WEB_403   Contains a web link that returns 403
 tflags     WEB_403   net

=head1 DESCRIPTION

Fetches web pages linked to in messages and provides their contents in a
pseudo-header that can be used in custom header rules.

An eval function is also provided to test a link's HTTP status code.

Limited decoding of data contained in pages is also attempted.  The decoded
data is provided in an additional pseudo-header that is made available to
custom header rules.

=head1 AUTHOR

Daryl C. W. O'Shea, DOS Technologies <spamassassin@dostech.ca>

=head1 COPYRIGHT

Copyright (c) 2005 Daryl C. W. O'Shea, DOS Technologies. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

=head1 WARNING: PRIVACY AND TECHNICAL ISSUES

There are inherent privacy issues involved in automatically querying a link
found in an email.  Doing so can allow the sender to confirm that the email
may have been accepted.

Spammers could use this info to confirm that an address may be valid or that
it may have gotten through to an end user, etc.

Legitimate senders may see the request in their server logs and believe that
an end user actually saw the email or visited the link queried, which may not
be the case.  In some cases, doing a query make trigger some unknown action by
the sender, such as may be the case when links are used to confirm an action
requested on behalf of the recipient.

To prevent such issues the plugin makes an effort to remove obvious query
strings and login tokens, but cannot possibly be aware of rewriting techniques
that could be used by a sender that controls the HTTP server serving the
resource being linked to.  As such you MUST ONLY INCLUDE THE DOMAINS OF FREE
WEB HOST PROVIDERS THAT YOU KNOW DO NOT ALLOW USERS TO CONTROL ANY ADVANCED
FEATURES.

=cut

# Version History
#
# 2006-02-11
#  - Replaced status code specific eval test with generic eval test that allows
#    the status code to be user configured.
#  - Check to make sure anything that looks like a directory is queried with a
#    trailing slash to avoid 301 responses pointing to the intended resource.
#  - Flag the privileged config settings as privileged.
#  - Fix some config option validation regexes.
#
# 2005-12-15
#  - Initial release

package Mail::SpamAssassin::Plugin::WebRedirect;

use Mail::SpamAssassin::Plugin;
use Mail::SpamAssassin::Logger;
use strict;
use warnings;
use bytes;

use vars qw(@ISA);
@ISA = qw(Mail::SpamAssassin::Plugin);

use constant HAS_LWP_USERAGENT => eval { require LWP::UserAgent; };

# Load Time::HiRes if it's available
BEGIN {
  use constant HAS_TIME_HIRES => eval { require Time::HiRes };
  if (HAS_TIME_HIRES) {
    Time::HiRes->import( qw(usleep ualarm gettimeofday tv_interval) );
  }
}


sub new {
  my $class = shift;
  my $mailsaobject = shift;

  $class = ref($class) || $class;
  my $self = $class->SUPER::new($mailsaobject);
  bless ($self, $class);

  if ($mailsaobject->{local_tests_only} || !HAS_LWP_USERAGENT) {
    $self->{disabled} = 1;
  } else {
    $self->{disabled} = 0;
  }

  $self->register_eval_rule("WebRedirect_Status");

  unless ($self->{disabled}) {
    $self->{ua} = new LWP::UserAgent;
  }

  $self->set_config($mailsaobject->{conf});

  return $self;
}


sub set_config {
  my($self, $conf) = @_;
  my @cmds = ();

=head1 USER PREFERENCES

There are no settings that can be configured by non-privileged users.

=cut

=head1 RULE DEFINITIONS AND PRIVILEGED SETTINGS

Only users running C<spamassassin> from their procmailrc's or forward files,
or sysadmins editing a file in C</etc/mail/spamassassin>, can use these
settings.   C<spamd> users cannot use them in their C<user_prefs> files, for
security and efficiency reasons, unless C<allow_user_rules> is enabled.

=over 4

=item web_redirect_host		geocities.com	(default: none)

A list of hostname patterns to perform HTTP queries against.  Normal shell
wild cards may be used, similar to those used in <C>whilelist_from entries.

Multiple hostname patterns per line are allowed, as are multiple lines.

 Example:
	web_redirect_host	geocities.com	*.geocities.com
	web_redirect_host	geocities.yahoo.com.??

=cut

  push (@cmds, {
    setting => 'web_redirect_host',
    is_priv => 1,
    default => {},
    code => sub {
      my ($self, $key, $value, $line) = @_;
      if ($value =~ /^$/) {
        return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
      }
      if ($value !~ /^[-.*?\w\s]+$/) {
	return $Mail::SpamAssassin::Conf::INVALID_VALUE;
      }
      foreach my $domain (split(/\s+/, $value)) {
	my $pattern = $domain;
	$domain =~ s/\./\\\./g;
	$domain =~ s/\?/\./g;
	$domain =~ s/\*/\.\*/g;
        $self->{web_redirect_host}->{lc $domain} = $pattern;
      }
    }
  });

=item web_redirect_skip_host	example.com	(default: none)

A list of hostname patterns to skip HTTP queries against.  A link will not be
queried against if its hostname pattern is in this list, even if it is listed
in the <C>web_redirect_host list by either of the site configuration or a
user's configuration.

The syntax used is the same as <C>web_redirect_host.

=cut

  push (@cmds, {
    setting => 'web_redirect_skip_host',
    is_priv => 1,
    default => {},
    code => sub {
      my ($self, $key, $value, $line) = @_;
      if ($value =~ /^$/) {
        return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
      }
      if ($value !~ /^[-.*?\w\s]+$/) {
	return $Mail::SpamAssassin::Conf::INVALID_VALUE;
      }
      foreach my $domain (split(/\s+/, $value)) {
	my $pattern = $domain;
	$domain =~ s/\./\\\./g;
	$domain =~ s/\?/\./g;
	$domain =~ s/\*/\.\*/g;
        $self->{web_redirect_skip_host}->{lc $domain} = $pattern;
      }
    }
  });

=item header Web-Redirect =~ /pattern/modifiers

The <C>Web-Redirect pseudo-header contains the contents of the web pages
queried and can be used in custom header rules as defined in SpamAssassin's
<C>Mail::SpamAssassin::Conf documentation.

=cut

=item header Web-Redirect-Encoded =~ /pattern/modifiers

The <C>Web-Redirect-Encoded pseudo-header contains the decoded contents of one
common encoding method spotted in most (as of mid to late 2005) web redirect
pages found on popular free web hosts.

Its use is identical to that of the <C>Web-Redirect pseudo-header.  When
scoring rules based on this pseudo-header keep in mind that somebody went to
the effort of encoding the contents, possibly to obscure it.

=cut

=head1 ADMINISTRATOR SETTINGS

These settings differ from the ones above, in that they are considered 'more
privileged' -- even more than the ones in the B<PRIVILEGED SETTINGS> section.
No matter what C<allow_user_rules> is set to, these can never be set from a
user's C<user_prefs> file when spamc/spamd is being used.  However, all
settings can be used by local programs run directly by the user.

=over 4

=item web_redirect_timeout	n.nn		(default: 3)

The amount of time in seconds to wait for a response to an HTTP GET request.

=cut

  push(@cmds, {
    setting => 'web_redirect_timeout',
    is_admin => 1,
    default => 3,
    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
  });

=item web_redirect_max_checks	n		(default: 3)

The maximum number of links to send HTTP GET requests for.

=cut

  push(@cmds, {
    setting => 'web_redirect_max_checks',
    is_admin => 1,
    default => 3,
    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
  });

=item web_redirect_max_size	n		(default: 50000)

The maximum data size to request in an HTTP GET request.  This is a maximum
per request, so multiply by the value of <C>web_redirect_max_checks to get the
maximum amount that may be requested.

When considering a value for this setting, keep in mind that the data received
in the body of each HTTP response will be added to the <C>Web-Redirect
pseudo-header.  Additionally, any encoded data found and decoded will be added
to the <C>Web-Redirect-Encoded pseudo-header.

=cut

  push(@cmds, {
    setting => 'web_redirect_max_size',
    is_admin => 1,
    default => 50000,
    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
  });

  $conf->{parser}->register_commands(\@cmds);
}


=item eval:WebRedirect_Status_Line('statuscode')

A function provided for use in eval rules to test the HTTP status of the links
queried.  A file glob style 'status' parameter MUST be included.  The 'status'
parameter will be matched against the HTTP status returned by the remote
server.

Examples:

  header   WEB_404   eval:WebRedirect_Status('404')

=cut 

sub WebRedirect_Status {
  my ($self, $pms, $status) = @_;
  return 0 if $self->{disabled};

  my $eval_name = $pms->get_current_eval_rule_name();

  # validate the eval call and complain if it was done wrong
  unless (defined $status && $status =~ /^\d{3}$/) {
    warn ("rules: eval rule: ". $eval_name 
	." requires a 3 digit status code parameter\n");

    dbg ("config: eval rule: ". $eval_name
	." requires a parameter such as: header "
	. $eval_name ." eval:WebRedirect_Status\(404\)");

    $pms->{rule_errors}++; # flag to --lint that there was an error ...
    return 0;
  }

  $self->_log_hit($pms, $status, $eval_name);
  return 0;
}


# this is the best place to hold up processing since it is usually called just
# after DNS checks are sent out
sub parsed_metadata {
  my ($self, $opts) = @_;
  my $pms = $opts->{permsgstatus};
  my $msg = $opts->{msg};

  return if $self->{disabled};

  my %uris = $self->_get_domains($pms);

  # set maximum page size
  $self->{ua}->max_size(($pms->{main}->{conf}->{web_redirect_max_size} | 50000));

  # set timeout per request
  $self->{ua}->timeout(($pms->{main}->{conf}->{web_redirect_timeout} | 15));

  # array of requested pages text
  my @pagetext = ();
  my @ciphertext = ();

  my $count = 0;
  my $max = $pms->{main}->{conf}->{web_redirect_max_checks};
  foreach my $uri (keys %uris) {
    $count++;
    if ($count > $max) {
      dbg("rules: maximum web_redirect checks reached, skipping remaining domains");
      last;
    }
    dbg("rules: checking uri: $uri");

    my $page = $self->_get_page($pms, $uri);

    if (defined $page) {
      # keep each page's plain-text
      push @pagetext, $page;

      # keep each page's cipher-text
      push @ciphertext, $self->_decode_page($page);
    }
  }

  # add all of the pages to the message's metadata
  $pms->{msg}->put_metadata('Web-Redirect', join('---new-page---', @pagetext));
  $pms->{msg}->put_metadata('Web-Redirect-Encoded', join('---new-page---', @ciphertext));

  dbg("rules: WebRedirect page text: start>>".
	$pms->{msg}->get_metadata('Web-Redirect') ."<<end");
  dbg("rules: WebRedirect decoded text: start>>".
	$pms->{msg}->get_metadata('Web-Redirect-Encoded') ."<<end");

  return;
}


sub _get_page {
  my ($self, $pms, $uri) = @_;

  my $request = new HTTP::Request('GET', $uri);

  # measure elapsed time while getting page
  my $t0 = (HAS_TIME_HIRES ? [gettimeofday] : time());
  my $response = $self->{ua}->simple_request($request);
  my $t1 = (HAS_TIME_HIRES ? [gettimeofday] : time());

  # track status codes
  $self->_track_hit($pms, $1, $uri) if ($response->status_line =~ /^(\d{3})/);

  # debug status line
  dbg("rules: request status: " . $response->status_line);

  # check the outcome of the response
  if ($response->is_success) {
    dbg("rules: got response to request in "
	. (HAS_TIME_HIRES ? (tv_interval $t0, $t1) : $t1 - $t0) . " seconds");
    return $response->content;
  }
  return;
}


sub _decode_page {
  my ($self, $plaintext, $iter) = @_;

  # deep-recursion protection
  $iter ||= 0;
  dbg("rules: _decode_page() iteration $iter");
  return $plaintext if ($iter > 3);

  my $decoded = '';

  # unescape this iteration of plaintext
  while ($plaintext =~ /\bunescape\s*\(\"(.*?)\"\)/ig) {
    my $escaped = $1;

    $escaped =~ s/\\x([0-9A-Fa-f]{2})/chr(hex($1))/eg;
    $escaped =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;

    $decoded .= $escaped;

    dbg("rules: decoded layer \#". ($iter+1) ." of encoding");
  }

  # recurse
  if ($decoded ne '') {
    $decoded .= ' ' . $self->_decode_page($decoded, ++$iter);
  }

  return $decoded;
}


sub _track_hit {
  my ($self, $pms, $rule, $uri) = @_;

  $pms->{WebRedirect}->{$rule}++;

  if (!defined $pms->{WebRedirect}->{hits}->{$rule}) {
    $pms->{WebRedirect}->{hits}->{$rule} = { };
  };
  $pms->{WebRedirect}->{hits}->{$rule}->{$uri} = 1;
}


sub _log_hit {
  my ($self, $pms, $rule, $rulename) = @_;

  if ($pms->{WebRedirect}->{hits}->{$rule}) {
    my $uris = join (' ', keys %{$pms->{WebRedirect}->{hits}->{$rule}});
    $pms->test_log ("URIs: $uris");
    $pms->got_hit ($rulename, "");
  }
}


sub _get_domains {
  my ($self, $pms) = @_;

  my %wanted_uris = ();

  # don't keep dereferencing
  my $domains      = $pms->{main}->{conf}->{web_redirect_host};
  my $skip_domains = $pms->{main}->{conf}->{web_redirect_skip_host};

  # get the full list of parsed domains
  my $uris = $pms->get_uri_detail_list();

  # go from uri => info to uri_ordered
  # 0: a
  # 1: form
  # 2: img
  # 3: !a_empty
  # 4: parsed
  # 5: a_empty
  while (my($uri, $info) = each %{$uris}) {
    # skip if uri doesn't have a type since it'll be in the list again with one
    # or isn't a type we want
    next unless $uri =~ /^http:\/\//i;

    # no domains were found via this uri, so skip
    next unless ($info->{domains});

    my $entry = 3;

    if ($info->{types}->{a}) {
      $entry = 5;

      # determine a vs a_empty
      foreach my $at (@{$info->{anchor_text}}) {
        if (length $at) {
          $entry = 0;
          last;
        }
      }
    }
    elsif ($info->{types}->{parsed} && (keys %{$info->{types}} == 1)) {
      $entry = 4;
    }

    # only use links to web pages, not images, etc.
    if ($entry == 0 || $entry == 4) {

      # check to see if it's a domain we want to check, first get hostname
      # making sure we omit any username@
      my $uri_host;
      $uri =~ /^.{3,4}:\/\/(?:[^\/]*\@)?(.+?)(?:\/|:|$)/;
      if (defined $1) {
	$uri_host = lc($1);
      } else {
        dbg("rules: couldn't determine hostname, skipping uri: $uri");
	next;
      }

      my $check_it = 0;
      # check to see if it's in the list of domains to check
      while (my ($regexp, $simple) = each (%{$domains})) {
	if ($uri_host =~ /^$regexp$/) {	# both already lc
	  dbg("rules: hostname: $uri_host matches check pattern: $simple");
	  $check_it = 1;
#	  last;
        }
      }
      next unless $check_it;

      # check to see if it's in the list of domains to skip
      while (my ($regexp, $simple) = each (%{$skip_domains})) {
	if ($uri_host =~ /^$regexp$/) {	# both already lc
	  dbg("rules: hostname: $uri_host matches skip pattern: $simple");
	  $check_it = 0;
#	  last;
	}
      }
      next unless $check_it;

      # lowercase the domain part
      # (to avoid duplicate lookups of the same differently cased domain)
      if ($uri =~ /^(.{8,}?)(\/.*)$/) {
	$uri = lc($1) . $2;
      } else {
	$uri = lc($uri) . "/";
      }

      # chop off any VISIBLE(!) query string to avoid user id keying
      # this isn't sure fire though... a little ModRewrite voodoo and you're
      # screwed
      $uri =~ s/\?.+$//;

      # add a trailing slash to avoid 301 MOVED when querying a directory
      if ($uri !~ /(?:\/|\.[^\/]{2,4})$/) {
	$uri .= "/";
	dbg("rules: added trailing slash to: $uri");
      }

      $wanted_uris{$uri} = 1;
    }
  }

  return %wanted_uris;
}


1;