# FuzzyOcr plugin, version 3.4 # # written by Christian Holler (decoder_at_own-hero_dot_net) # and Jorge Valdes (jorge_at_joval_dot_info) package FuzzyOcr; use strict; use warnings; use Mail::SpamAssassin; use Mail::SpamAssassin::Logger; use Mail::SpamAssassin::Util; use Mail::SpamAssassin::Timeout; use Mail::SpamAssassin::Plugin; use Time::HiRes qw( gettimeofday tv_interval ); use String::Approx 'adistr'; use FileHandle; # added rob@egressive.com 20070603 use File::Temp qw/ tempfile tempdir /; use Fcntl ':flock'; use POSIX; use lib qw(/etc/mail/spamassassin); # Allow placing of FuzzyOcr in siteconfigdir use FuzzyOcr::Logging qw(debuglog errorlog warnlog infolog); use FuzzyOcr::Config qw(kill_pid get_tmpdir set_tmpdir get_all_tmpdirs get_pms save_pms get_timeout get_mysql_ddb get_scansets get_wordlist set_config get_config parse_config finish_parsing_end read_words); use FuzzyOcr::Hashing qw(check_image_hash_db add_image_hash_db calc_image_hash); use FuzzyOcr::Deanimate qw(deanimate); use FuzzyOcr::Scoring qw(wrong_ctype wrong_extension corrupt_img known_img_hash); use FuzzyOcr::Misc qw(max removedir removedirs save_execute); our @ISA = qw(Mail::SpamAssassin::Plugin); # constructor: register the eval rule sub new { my ( $class, $mailsa ) = @_; $class = ref($class) || $class; my $self = $class->SUPER::new($mailsa); bless( $self, $class ); $self->register_eval_rule("fuzzyocr_check"); $self->register_eval_rule("dummy_check"); $self->set_config($mailsa->{conf}); return $self; } sub dummy_check { return 0; } sub fuzzyocr_check { my ( $self, $pms ) = @_; my $conf = get_config(); save_pms($pms); my $end; my $begin = [gettimeofday]; if ($conf->{focr_global_timeout}) { my $t = get_timeout(); debuglog("Global Timeout set at ".$conf->{focr_timeout}." sec."); $t->run(sub { $end = fuzzyocr_do( $self, $conf, $pms ); }); if ($t->timed_out()) { infolog("Scan timed out after $conf->{focr_timeout} seconds."); infolog("Killing possibly running pid..."); my ($ret, $pid) = kill_pid(); if ($ret > 0) { infolog("Successfully killed PID $pid"); } elsif ($ret < 0) { infolog("No processes left... exiting"); } else { infolog("Failed to kill PID $pid, stale process!"); } infolog("Removing possibly leftover tempdirs..."); removedirs(get_all_tmpdirs()); return 0; } } else { $end = fuzzyocr_do( $self, $conf, $pms ); } debuglog("Processed in ". sprintf("%.6f",tv_interval($begin, [gettimeofday])) ." sec."); return $end; } sub fuzzyocr_do { my ( $self, $conf, $pms ) = @_; my $internal_score = 0; my $current_score = $pms->get_score(); my $score = $conf->{focr_autodisable_score} || 100; if ( $current_score > $score ) { infolog("Scan canceled, message has already more than $score points ($current_score)."); return 0; } my $nscore = $conf->{focr_autodisable_negative_score} || -100; if ( $current_score < $nscore ) { infolog("Scan canceled, message has less than $nscore points ($current_score)."); return 0; } my $imgdir; my %imgfiles = (); my @found = (); my @hashes = (); my $cnt = 0; my $imgerr = 0; my $main = $self->{main}; debuglog("Starting FuzzyOcr..."); #Show PMS info if asked to if ($conf->{focr_log_pmsinfo}) { my $msgid = $pms->get('Message-Id') ? $pms->get('Message-Id') : ""; my $from = $pms->get('From') ? $pms->get('From') : ""; my $to = $pms->get('To') ? $pms->get('To') : ""; chomp($from, $to, $msgid); infolog("Processing Message with ID \"$msgid\" ($from -> $to)"); } foreach my $p ( $pms->{msg}->find_parts(qr(^image\b)i), $pms->{msg}->find_parts(qr(Application/Octet-Stream)i) ) { my $ctype = $p->{'type'}; my $fname = $p->{'name'} || 'unknown'; if (($fname eq 'unknown') and (defined $p->{'headers'}->{'content-id'}) ){ $fname = join('',@{$p->{'headers'}->{'content-id'}}); $fname =~ s/[<>]//g; $fname =~ tr/\@\$\%\&/_/s; } my $filename = $fname; $filename =~ tr{a-zA-Z0-9\-.}{_}cs; debuglog("fname: \"$fname\" => \"$filename\""); my $pdata = $p->decode(); my $pdatalen = length($pdata); my $w = 0; my $h = 0; if ( substr($pdata,0,3) eq "\x47\x49\x46" ) { ## GIF File $imgfiles{$filename}{ftype} = 1; ($w,$h) = unpack("vv",substr($pdata,6,4)); infolog("GIF: [${h}x${w}] $filename ($pdatalen)"); $imgfiles{$filename}{width} = $w; $imgfiles{$filename}{height} = $h; } elsif ( substr($pdata,0,2) eq "\xff\xd8" ) { ## JPEG File my @Markers = (0xC0,0xC1,0xC2,0xC3,0xC5,0xC6,0xC7,0xC9,0xCA,0xCB,0xCD,0xCE,0xCF); my $pos = 2; while ($pos < $pdatalen) { my ($b,$m) = unpack("CC",substr($pdata,$pos,2)); $pos += 2; if ($b != 0xff) { infolog("Invalid JPEG image"); $pos = $pdatalen + 1; last; } my $skip = 0; foreach my $mm (@Markers) { if ($mm == $m) { $skip++; last; } } last if ($skip); $pos += unpack("n",substr($pdata,$pos,2)); } if ($pos > $pdatalen) { errorlog("Cannot find image dimensions"); } else { ($h,$w) = unpack("nn",substr($pdata,$pos+3,4)); infolog("JPEG: [${h}x${w}] $filename ($pdatalen)"); $imgfiles{$filename}{ftype} = 2; $imgfiles{$filename}{height} = $h; $imgfiles{$filename}{width} = $w; } } elsif ( substr($pdata,0,4) eq "\x89\x50\x4e\x47" ) { # PNG File ($w,$h) = unpack("NN",substr($pdata,16,8)); $imgfiles{$filename}{ftype} = 3; $imgfiles{$filename}{width} = $w; $imgfiles{$filename}{height} = $h; infolog("PNG: [${h}x${w}] $filename ($pdatalen)"); } elsif ( substr($pdata,0,2) eq "BM" ) { ## BMP File ($w,$h) = unpack("VV",substr($pdata,18,8)); $imgfiles{$filename}{ftype} = 4; $imgfiles{$filename}{width} = $w; $imgfiles{$filename}{height} = $h; infolog("BMP: [${h}x${w}] $filename ($pdatalen)"); } elsif ( ## TIFF File (substr($pdata,0,4) eq "\x4d\x4d\x00\x2a") or (substr($pdata,0,4) eq "\x49\x49\x2a\x00") ) { my $worder = (substr($pdata,0,2) eq "\x4d\x4d") ? 0 : 1; my $offset = unpack($worder?"V":"N",substr($pdata,4,4)); my $number = unpack($worder?"v":"n",substr($pdata,$offset,2)) - 1; foreach my $n (0 .. $number) { my $add = 2 + ($n * 12); my ($id,$tag,$cnt,$val) = unpack($worder?"vvVV":"nnNN",substr($pdata,$offset+$add,12)); $h = $val if ($id == 256); $w = $val if ($id == 257); last if ($h != 0 and $w != 0); } infolog("TIFF: [${h}x${w}] $filename ($pdatalen) ($worder)"); infolog("Cannot determine size of TIFF image, setting to '1x1'") if ($h == 0 and $w == 0); $imgfiles{$filename}{ftype} = 5; $imgfiles{$filename}{width} = $w ? $w : 1; $imgfiles{$filename}{height} = $h ? $h : 1; } #Skip unless we found the right header unless (defined $imgfiles{$filename}{ftype}) { infolog("Skipping file with content-type=\"$ctype\" name=\"$fname\""); delete $imgfiles{$filename}; next; } #Skip images that cannot contain text if ($imgfiles{$filename}{height} < $conf->{focr_min_height}) { infolog("Skipping image: height < $conf->{focr_min_height}"); delete $imgfiles{$filename}; next; } #Skip images that cannot contain text if ($imgfiles{$filename}{width} < $conf->{focr_min_width}) { infolog("Skipping image: width < $conf->{focr_min_width}"); delete $imgfiles{$filename}; next; } #Skip too big images, screenshots etc if ($imgfiles{$filename}{height} > $conf->{focr_max_height}) { infolog("Skipping image: height > $conf->{focr_max_height}"); delete $imgfiles{$filename}; next; } #Skip too big images, screenshots etc if ($imgfiles{$filename}{width} > $conf->{focr_max_width}) { infolog("Skipping image: width > $conf->{focr_max_width}"); delete $imgfiles{$filename}; next; } #Found Image!! Get a temporary dir to save image # changed rob@egressive.com 20070218 #$imgdir = Mail::SpamAssassin::Util::secure_tmpdir(); $imgdir = File::Temp::mkdtemp('/tmp/focr.XXXX'); unless ($imgdir) { errorlog("Scan canceled, cannot create Image TMPDIR."); return 0; } set_tmpdir($imgdir); #Generete unique filename to store image my $imgfilename = Mail::SpamAssassin::Util::untaint_file_path( $imgdir . "/" . $filename ); my $unique = 0; while (-e $imgfilename) { $imgfilename = Mail::SpamAssassin::Util::untaint_file_path( $imgdir . "/" . chr(65+$unique) . "." . $filename ); $unique++; } #Save important constants $imgfiles{$filename}{fname} = $fname; $imgfiles{$filename}{ctype} = $ctype; $imgfiles{$filename}{fsize} = $pdatalen; $imgfiles{$filename}{fpath} = $imgfilename; #Save Image to disk. unless (open PICT, ">$imgfilename") { errorlog("Cannot write \"$imgfilename\", skipping..."); delete $imgfiles{$filename}; removedir($imgdir); next; } binmode PICT; print PICT $pdata; close PICT; debuglog("Saved: $imgfilename"); #Increment valid image file counter $cnt++; #keep raw email for debugging later my $rawfilename = $imgdir . "/raw.eml"; if (open RAW, ">$rawfilename") { print RAW $pms->{msg}->get_pristine(); close RAW; debuglog("Saved: $rawfilename"); } } if ($cnt == 0) { debuglog("Skipping OCR, no image files found..."); return 0; } infolog("Found: $cnt images"); $cnt = 0; if ($conf->{focr_enable_image_hashing} == 3) { $conf->{focr_mysql_ddb} = get_mysql_ddb(); } # Try to load personal wordlist unless ($conf->{focr_no_homedirs}) { if ($conf->{focr_personal_wordlist} =~ m/^\//) { read_words( $conf->{focr_personal_wordlist} ); } else { my $peruserlist = $main->sed_path($conf->{focr_personal_wordlist}); if ( -r $peruserlist ) { read_words( $peruserlist ); } else { # Only complain if the file exists if ( -e $peruserlist ) { errorlog("Cannot read personal_wordlist: $peruserlist, skipping..."); } } } } my $haserr; foreach my $filename (keys %imgfiles) { my $pic = $imgfiles{$filename}; #infolog("Analyzing file with content-type=\"$$pic{ctype}\""); my @used_scansets = (); my $corrupt = 0; my $suffix = 0; my $generic_ctype = 0; my $digest; my $file = $$pic{fpath}; my $tfile = $file; my $pfile = $file . ".pnm"; my $efile = $file . ".err"; debuglog("pfile => $pfile"); debuglog("efile => $efile"); #Open ERRORLOG $haserr = $Mail::SpamAssassin::Logger::LOG_SA{level} == 3; if ($haserr) { $haserr = open RAWERR, ">$imgdir/raw.err"; debuglog("Errors to: $imgdir/raw.err") if ($haserr>0); } my $mimetype = $$pic{ctype}; if($mimetype =~ m'application/octet-stream'i) { $generic_ctype = 1; } if($$pic{fname} =~ /\.([\w-]+)$/) { $suffix = $1; } if ($suffix) { debuglog("File has Content-Type \"$mimetype\" and File Extension \"$suffix\""); } else { debuglog("File has Content-Type \"$mimetype\" and no File Extension"); } if ( $$pic{ftype} == 1 ) { infolog("Found GIF header name=\"$$pic{fname}\""); if ($conf->{focr_skip_gif}) { infolog("Skipping image check"); next; } if (defined($conf->{focr_max_size_gif}) and ($$pic{fsize} > $conf->{focr_max_size_gif})) { infolog("GIF file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); next; } if ( ($$pic{ctype} !~ /gif/i) and not $generic_ctype) { wrong_ctype( "GIF", $$pic{ctype} ); $internal_score += $conf->{'focr_wrongctype_score'}; } if ( $suffix and $suffix !~ /gif/i) { wrong_extension( "GIF", $suffix); $internal_score += $conf->{'focr_wrongext_score'}; } my $interlaced_gif = 0; my $image_count = 0; foreach my $a (qw/gifsicle giftext giffix gifinter giftopnm/) { unless (defined $conf->{"focr_bin_$a"}) { errorlog("Cannot exec $a, skipping image"); next; } } my @stderr_data; my ($retcode, @stdout_data) = save_execute( "$conf->{focr_bin_giftext} $file", undef, ">$imgdir/giftext.info", ">>$imgdir/giftext.err", 1); if ($retcode<0) { # only care if we timed out chomp $retcode; errorlog("$conf->{focr_bin_giftext} Timed out [$retcode], skipping..."); ++$imgerr if $conf->{focr_keep_bad_images}>0; next; } foreach (@stdout_data) { unless ($interlaced_gif) { if ( $_ =~ /Image is Interlaced/i ) { $interlaced_gif = 1; } } if ( $_ =~ /^Image #/ ) { $image_count++; } } if ($interlaced_gif or ($image_count > 1)) { infolog("Image is interlaced or animated..."); } else { infolog("Image is single non-interlaced..."); $tfile .= "-fixed.gif"; printf RAWERR "## $conf->{focr_bin_giffix} $file >$tfile 2>>$efile\n" if ($haserr>0); $retcode = save_execute("$conf->{focr_bin_giffix} $file", undef, ">$tfile", ">>$efile"); if ($retcode<0) { # only care if we timed out chomp $retcode; errorlog("$conf->{focr_bin_giffix}: Timed out [$retcode], skipping..."); printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); ++$imgerr if $conf->{focr_keep_bad_images}>0; next; } if (open ERR, $efile) { @stderr_data = ; close ERR; foreach (@stderr_data) { if ( $_ =~ /GIF-LIB error/i ) { $corrupt = $_; last; } } } } my $fixedsize = (stat($tfile))[7]; if (defined($conf->{focr_max_size_gif}) and ($fixedsize > $conf->{focr_max_size_gif})) { infolog("Fixed GIF file size ($fixedsize) exceeds maximum file size for this format, skipping..."); next; } if ($corrupt) { if ($interlaced_gif or ($image_count > 1)) { infolog("Skipping corrupted interlaced image..."); corrupt_img($conf->{focr_corrupt_unfixable_score}, $corrupt); $internal_score += $conf->{focr_corrupt_unfixable_score}; next; } if (-z $tfile) { infolog("Uncorrectable corruption detected, skipping non-interlaced image..."); corrupt_img($conf->{focr_corrupt_unfixable_score}, $corrupt); $internal_score += $conf->{focr_corrupt_unfixable_score}; next; } infolog("Image is corrupt, but seems fixable, continuing..."); corrupt_img($conf->{focr_corrupt_score}, $corrupt); $internal_score += $conf->{focr_corrupt_score}; } if ($image_count > 1) { infolog("File contains <$image_count> images, deanimating..."); $tfile = deanimate($tfile); } if ($interlaced_gif) { infolog("Processing interlaced_gif $tfile..."); my $cfile = $tfile; if ($tfile =~ m/\.gif$/i) { $tfile =~ s/\.gif$/-fixed.gif/i; } else { $tfile .= ".gif"; } printf RAWERR qq(## $conf->{focr_bin_gifinter} $cfile >$tfile 2>>$efile\n) if ($haserr>0); $retcode = save_execute("$conf->{focr_bin_gifinter} $cfile", undef, ">$tfile", ">>$efile"); if ($retcode<0) { chomp $retcode; printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); errorlog("$conf->{focr_bin_gifinter}: Timed out [$retcode], skipping..."); ++$imgerr if $conf->{focr_keep_bad_images}>0; next; } elsif ($retcode>0) { chomp $retcode; printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_gifinter}\n" if ($haserr>0); errorlog("$conf->{focr_bin_gifinter}: Returned [$retcode], skipping..."); ++$imgerr if $conf->{focr_keep_bad_images}>0; next; } } printf RAWERR qq(## $conf->{focr_bin_giftopnm} $tfile >$pfile 2>>$efile\n) if ($haserr>0); $retcode = save_execute("$conf->{focr_bin_giftopnm} $tfile", undef, ">$pfile", ">>$efile"); if ($retcode<0) { chomp $retcode; printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); errorlog("$conf->{focr_bin_giftopnm}: Timed out [$retcode], skipping..."); ++$imgerr if $conf->{focr_keep_bad_images}>0; next; } elsif ($retcode>0) { chomp $retcode; printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_giftopnm}\n" if ($haserr>0); errorlog("$conf->{focr_bin_giftopnm}: Returned [$retcode], skipping..."); ++$imgerr if $conf->{focr_keep_bad_images}>0; next; } } elsif ( $$pic{ftype} == 2 ) { infolog("Found JPEG header name=\"$$pic{fname}\""); if ($conf->{focr_skip_jpeg}) { infolog("Skipping image check"); next; } if (defined($conf->{focr_max_size_jpeg}) and ($$pic{fsize} > $conf->{focr_max_size_jpeg})) { infolog("JPEG file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); next; } if ( ($$pic{ctype} !~ /(jpeg|jpg)/i) and not $generic_ctype) { wrong_ctype( "JPEG", $$pic{ctype} ); $internal_score += $conf->{'focr_wrongctype_score'}; } if ( $suffix and $suffix !~ /(jpeg|jpg|jfif)/i) { wrong_extension( "JPEG", $suffix); $internal_score += $conf->{'focr_wrongext_score'}; } foreach my $a (qw/jpegtopnm/) { unless (defined $conf->{"focr_bin_$a"}) { errorlog("Cannot exec $a, skipping image"); next; } } printf RAWERR qq(## $conf->{focr_bin_jpegtopnm} $file >$pfile 2>>$efile\n) if ($haserr>0); my $retcode = save_execute("$conf->{focr_bin_jpegtopnm} $file", undef, ">$pfile", ">>$efile"); if ($retcode<0) { chomp $retcode; printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); errorlog("$conf->{focr_bin_jpegtopnm}: Timed out [$retcode], skipping..."); ++$imgerr if $conf->{focr_keep_bad_images}>0; next; } elsif ($retcode>0) { chomp $retcode; printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_jpegtopnm}\n" if ($haserr>0); errorlog("$conf->{focr_bin_jpegtopnm}: Returned [$retcode], skipping..."); ++$imgerr if $conf->{focr_keep_bad_images}>0; next; } } elsif ( $$pic{ftype} == 3 ) { infolog("Found PNG header name=\"$$pic{fname}\""); if ($conf->{focr_skip_png}) { infolog("Skipping image check"); next; } if (defined($conf->{focr_max_size_png}) and ($$pic{fsize} > $conf->{focr__max_size_png})) { infolog("PNG file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); next; } if ( ($$pic{ctype} !~ /png/i) and not $generic_ctype) { wrong_ctype( "PNG", $$pic{ctype} ); $internal_score += $conf->{'focr_wrongctype_score'}; } if ( $suffix and $suffix !~ /(png)/i) { wrong_extension( "PNG", $suffix); $internal_score += $conf->{'focr_wrongext_score'}; } foreach my $a (qw/pngtopnm/) { unless (defined $conf->{"focr_bin_$a"}) { errorlog("Cannot exec $a, skipping image"); next; } } printf RAWERR qq(## $conf->{focr_bin_pngtopnm} $file >$pfile 2>>$efile\n) if ($haserr>0); my $retcode = save_execute("$conf->{focr_bin_pngtopnm} $file", undef, ">$pfile", ">>$efile"); if ($retcode<0) { chomp $retcode; printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); errorlog("$conf->{focr_bin_pngtopnm}: Timed out [$retcode], skipping..."); ++$imgerr if $conf->{focr_keep_bad_images}>0; next; } elsif ($retcode>0) { chomp $retcode; printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_pngtopnm}\n" if ($haserr>0); errorlog("$conf->{focr_bin_pngtopnm}: Returned [$retcode], skipping..."); ++$imgerr if $conf->{focr_keep_bad_images}>0; next; } } elsif ( $$pic{ftype} == 4 ) { infolog("Found BMP header name=\"$$pic{fname}\""); if ($conf->{focr_skip_bmp}) { infolog("Skipping image check"); next; } if (defined($conf->{focr_max_size_bmp}) and ($$pic{fsize} > $conf->{focr_max_size_bmp})) { infolog("BMP file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); next; } if ( ($$pic{ctype} !~ /bmp/i) and not $generic_ctype) { wrong_ctype( "BMP", $$pic{ctype} ); $internal_score += $conf->{'focr_wrongctype_score'}; } if ( $suffix and $suffix !~ /(bmp)/i) { wrong_extension( "BMP", $suffix); $internal_score += $conf->{'focr_wrongext_score'}; } foreach my $a (qw/bmptopnm/) { unless (defined $conf->{"focr_bin_$a"}) { errorlog("Cannot exec $a, skipping image"); next; } } printf RAWERR qq(## $conf->{focr_bin_bmptopnm} $file >$pfile 2>>$efile\n) if ($haserr>0); my $retcode = save_execute("$conf->{focr_bin_bmptopnm} $file", undef, ">$pfile", ">>$efile"); if ($retcode<0) { chomp $retcode; printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); errorlog("$conf->{focr_bin_bmptopnm}: Timed out [$retcode], skipping..."); ++$imgerr if $conf->{focr_keep_bad_images}>0; next; } elsif ($retcode>0) { chomp $retcode; printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_bmptopnm}\n" if ($haserr>0); errorlog("$conf->{focr_bin_bmptopnm}: Returned [$retcode], skipping..."); ++$imgerr if $conf->{focr_keep_bad_images}>0; next; } } elsif ( $$pic{ftype} == 5 ) { infolog("Found TIFF header name=\"$$pic{fname}\""); if ($conf->{focr_skip_tiff}) { infolog("Skipping image check"); next; } if (defined($conf->{focr_max_size_tiff}) and ($$pic{fsize} > $conf->{focr_max_size_tiff})) { infolog("TIFF file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); next; } if ( ($$pic{ctype} !~ /tif/i) and not $generic_ctype) { wrong_ctype( "TIFF", $$pic{ctype} ); $internal_score += $conf->{'focr_wrongctype_score'}; } if ( $suffix and $suffix !~ /tif/i) { wrong_extension( "TIFF", $suffix); $internal_score += $conf->{'focr_wrongext_score'}; } foreach my $a (qw/tifftopnm/) { unless (defined $conf->{"focr_bin_$a"}) { errorlog("Cannot exec $a, skipping image"); next; } } printf RAWERR qq(## $conf->{focr_bin_tifftopnm} $file >$pfile 2>>$efile\n) if ($haserr>0); my $retcode = save_execute("$conf->{focr_bin_tifftopnm} $file", undef, ">$pfile", ">>$efile"); if ($retcode<0) { chomp $retcode; printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); errorlog("$conf->{focr_bin_tifftopnm}: Timed out [$retcode], skipping..."); ++$imgerr if $conf->{focr_keep_bad_images}>0; next; } elsif ($retcode>0) { chomp $retcode; printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_tifftopnm}\n" if ($haserr>0); errorlog("$conf->{focr_bin_tifftopnm}: Returned [$retcode], skipping..."); ++$imgerr if $conf->{focr_keep_bad_images}>0; next; } } else { errorlog("Image type not recognized, unknown format. Skipping this image..."); next; } if($conf->{focr_enable_image_hashing}) { infolog("Calculating image hash for: $pfile"); ($corrupt, $digest) = calc_image_hash($pfile,$pic); if ($corrupt) { infolog("Error calculating the image hash, skipping hash check..."); } else { my ($score, $dinfo, $whash); $whash = $conf->{focr_enable_image_hashing} == 3 ? $conf->{focr_mysql_hash} : $conf->{focr_db_hash}; ($score,$dinfo) = check_image_hash_db($digest, $whash, $$pic{fname}, $$pic{ctype}, $$pic{ftype}); if ($score > 0) { known_img_hash($score,$dinfo); infolog("Message is SPAM. $dinfo") if ($conf->{focr_enable_image_hashing} < 3); removedirs(get_all_tmpdirs()); return 0; } $whash = $conf->{focr_enable_image_hashing} == 3 ? $conf->{focr_mysql_safe} : $conf->{focr_db_safe}; ($score,$dinfo) = check_image_hash_db($digest, $whash, $$pic{fname}, $$pic{ctype}, $$pic{ftype}); if ($score > 0) { infolog("Image in KNOWN_GOOD. Skipping OCR checks..."); next; } } if ($digest eq '') { infolog("Empty Hash, skipping..."); next; } } else { infolog("Image hashing disabled in configuration, skipping..."); } # Note: $current_score is here the score that the message had at the beginning # and $score is the autodisable_score defined in the config # $internal_score describes the score that the message got by FuzzyOcr so far. if ($internal_score + $current_score > $score) { my $total = $internal_score + $current_score; warnlog("FuzzyOcr stopped, message got $internal_score points by other FuzzyOcr tests ($total>$score)."); #infolog("OCR canceled, message got already more than $score points ($total) by other FuzzyOcr tests."); return 0; } my @ocr_results = (); my $scansets = get_scansets(); my $newlist = ''; foreach my $s (@$scansets) { $newlist .= ' ' . $s->{label} . '(' . $s->{hit_counter} . ')'; } infolog("Scanset Order:$newlist"); my $mcnt = 0; my $modus = 0; my $modus_match = 0; my $wref = get_wordlist(); my %words = %$wref; foreach my $scanset (@$scansets) { my $scanlabel = $scanset->{label}; my $scancmd = $scanset->{command}; if ($scancmd =~ m/^\$/) { warnlog("Skipping $scanlabel, invalid command '$scancmd'"); next; } if (($scancmd =~ m/ocrad/) and ($$pic{width} < 16 or $$pic{height} < 16)) { warnlog("Skipping $scanlabel, image too small"); next; } my $cmcnt = 0; my @cfound; if (defined $scanset->{args}) { $scancmd .= ' ' . $scanset->{args}; } printf RAWERR qq(## $scancmd\n) if ($haserr>0); my ($retcode, @result) = $scanset->run($pfile); if ($retcode<0) { if ($retcode == -1) { printf RAWERR qq(Timeout[$conf->{focr_timeout}]: $scancmd\n) if ($haserr>0); errorlog("Timeout[$scanlabel]: \"$scancmd\" took more than $conf->{focr_timeout} sec."); } elsif ($retcode == -2) { printf RAWERR qq(Cannot exec[$scanlabel]: $scancmd\n) if ($haserr>0); errorlog("Cannot execute($scanlabel): \"$scancmd\""); } else { printf RAWERR qq(Unknown error <$retcode>: $scancmd\n) if ($haserr>0); errorlog("Unknown error: [$retcode]..."); } infolog("Skipping scanset, trying next..."); next; } elsif ($retcode>0) { chomp $retcode; my $errstr = "Return code: $retcode, Error: "; $errstr .= join( '', @result ); warnlog("Errors in Scanset \"$scanlabel\""); warnlog($errstr); warnlog("Skipping scanset because of errors, trying next..."); printf RAWERR qq($errstr\n) if ($haserr>0); next; } debuglog("ocrdata=>>".join("",@result)."<<=end"); foreach $modus (0 .. 1) { foreach my $ww (keys %words) { my $w = lc $ww; $w =~ s/[^a-z0-9 ]//g; if ($modus) { $w =~ s/ //g; } if ($conf->{focr_strip_numbers}) { $w =~ s/[0-9]//g; } my $wcnt = 0; foreach (@result) { $_ = lc; if ($modus) { s/ //g; } if ($conf->{focr_strip_numbers}) { tr/!;|(0815/iiicoals/; s/[0-9]//g; } else { tr/!;|(/iiic/; } s/[^a-z0-9 ]//g; my $matched = abs(adistr( $w, $_ )); if ( $matched < $words{$ww} ) { $wcnt++; infolog( "Scanset \"$scanlabel\" found word \"$w\" with fuzz of " . sprintf("%0.4f",$matched) . "\nline: \"$_\"" ); if ($conf->{focr_unique_matches}) { last; } } } $cmcnt += $wcnt; if ( ( $conf->{focr_verbose} > 0 ) and ($wcnt) ) { push( @cfound, "\"$w\" in $wcnt lines" ); } } $mcnt = max($mcnt, $cmcnt); if ($mcnt == $cmcnt) { @found = @cfound; } if ((not $modus) and ($cmcnt >= $conf->{focr_counts_required})) { if ($mcnt == $cmcnt) { $modus_match = 0; } debuglog("Enough OCR Hits without space stripping, skipping second matching pass..."); last; } elsif (not $modus) { debuglog("Not enough OCR Hits without space stripping, doing second matching pass..."); if ($mcnt == $cmcnt) { $modus_match = 1; } } } if ($mcnt >= $conf->{focr_counts_required} and $conf->{focr_minimal_scanset}) { infolog("Scanset \"$scanlabel\" generates enough hits ($mcnt), skipping further scansets..."); if ($conf->{focr_autosort_scanset}) { foreach my $s (@$scansets) { if ($s->{label} eq $scanlabel) { if ($s->{hit_counter} < $conf->{focr_autosort_buffer}) { $s->{hit_counter} = $s->{hit_counter} + 1; } } else { if ($s->{hit_counter} > 0) { $s->{hit_counter} = $s->{hit_counter} - 1; } } } } last; } } if ($conf->{focr_enable_image_hashing}) { my $info = join('::',$mcnt,$$pic{fname},$$pic{ctype},$$pic{ftype},$digest); push(@hashes, $info); } # Normal match or match without spaces? if ($modus_match) { $cnt += $mcnt; } else { $cnt += $conf->{focr_twopass_scoring_factor} * $mcnt; } } close RAWERR if ($haserr>0); if ($cnt == 0) { if ($conf->{focr_enable_image_hashing} > 1 and @hashes) { infolog("Message is ham, saving..."); foreach my $h (@hashes) { my ($mcnt,$fname,$ctype,$ftype,$digest) = split('::',$h,5); next if $mcnt; my $whash = $conf->{focr_enable_image_hashing} == 3 ? $conf->{focr_mysql_safe} : $conf->{focr_db_safe}; add_image_hash_db($digest,0,$whash,$fname,$ctype,$ftype); } } } else { my $score = '0.000'; my $debuginfo = ( "Words found:\n" . join( "\n", @found ) . "\n($cnt word occurrences found)" ); if ($cnt >= $conf->{focr_counts_required}) { $score = sprintf "%0.3f", $conf->{focr_base_score} + (( $cnt - $conf->{focr_counts_required} ) * $conf->{focr_add_score} ); infolog("Message is spam, score = $score"); } else { $score = sprintf("%0.3f", $conf->{focr_add_score} * $cnt) if $conf->{focr_score_ham}; infolog("Message is ham, score = $score"); } if ($conf->{focr_enable_image_hashing} and $conf->{focr_hashing_learn_scanned} and $score > 0) { foreach my $h (@hashes) { my ($mcnt,$fname,$ctype,$ftype,$digest) = split('::',$h,5); next unless $mcnt; my $whash = $conf->{focr_enable_image_hashing} == 3 ? $conf->{focr_mysql_hash} : $conf->{focr_db_hash}; add_image_hash_db($digest,$score,$whash,$fname,$ctype,$ftype,$debuginfo); } } if ( $conf->{focr_verbose} > 0 and $conf->{focr_verbose} < 3 ) { infolog($debuginfo) unless ($conf->{focr_enable_image_hashing} == 3); } for my $set ( 0 .. 3 ) { $pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR"} = $score; } $pms->_handle_hit( "FUZZY_OCR", $score, "BODY: ", $pms->{conf}->{descriptions}->{FUZZY_OCR} . "\n$debuginfo" ); } if ($imgerr == 0 and $conf->{focr_keep_bad_images}<2) { removedirs(get_all_tmpdirs()); } if ($conf->{focr_enable_image_hashing} == 3) { if (defined $conf->{focr_mysql_ddb}) { $conf->{focr_mysql_ddb}->disconnect; } } debuglog("FuzzyOcr ending successfully..."); return 0; } 1; #vim: et ts=4 sw=4