# -*-Perl-*-
################################################################
###
### Grep.pm
###
### Author: Internet Message Group
### Created: Nov 03, 1997
### Revised: Jul 4, 2004
###
my $PM_VERSION = "IM::Grep.pm version 20031028(IM146)";
package IM::Grep;
require 5.003;
require Exporter;
use IM::Config;
use IM::Util;
use IM::Folder qw(message_list message_range);
use IM::Japanese;
use IM::EncDec qw(mime_decode_string);
use integer;
use strict;
use vars qw(@ISA @EXPORT %MESSAGE_ID_HASH);
@ISA = qw(Exporter);
@EXPORT = qw(parse_expression grep_folder sortuniq);
##
## Environments
##
# regexp for range syntax (sequence not supported)
my $range_element = '(\\d+|first|last|next|prev|new)';
my $range_regexp = "($range_element(-$range_element|:[+-]?\\d+)?|all)";
# end of header in draft message
my $draft_delimiter = "\n----\n";
%MESSAGE_ID_HASH = ();
sub grep_folder($$$@) {
my($folder, $eval_string, $dup_check, @ranges) = @_;
my $folder_dir;
my @src_msgs = ();
my @messages = ();
if ($folder =~ /^\-/) {
im_warn("Newsspool $folder search not supported (ignored)\n");
}
im_debug("Going on $folder range @ranges\n") if &debug('all');
$folder_dir = expand_path($folder);
chdir($folder_dir) or
im_die("unable to change directory to $folder_dir: $!\n");
im_debug("entered $folder_dir\n") if &debug('all');
# collect message numbers
my @filesinfolder = message_list($folder_dir);
foreach (@ranges) {
my @tmp = ();
im_die("illegal range specification: $_\n")
unless /^$range_regexp$/;
im_debug("extract range $_\n") if &debug('all');
if ((@tmp = message_range($folder, $_, @filesinfolder)) eq '') {
im_warn("message $_ out of range\n");
}
push(@src_msgs, @tmp);
}
im_debug("extracted messages \"@src_msgs\"\n") if &debug('all');
@src_msgs = sortuniq(@src_msgs);
im_debug("uniqified messages \"@src_msgs\"\n") if &debug('all');
# dirty quick hack to determine what part is required
# should be implemented better
my(%find) = ('head' => scalar($eval_string =~ /\$head\s*=~/),
'body' => scalar($eval_string =~ /\$body\s*=~/),
'all' => scalar($eval_string =~ /\$all\s*=~/));
my $m;
foreach $m (@src_msgs) {
my($all, $head, $body) = ('', '', '');
local($/);
unless (im_open(\*MES, "< $m")) {
if (! $main::opt_quiet) {
im_warn("message $m not exists: $!\n");
}
next;
}
# read $head anyway
#
$/ = '';
$head = ;
# if the header contains draft-style header delimiter,
# truncate the header and seek to the beginning of body.
my $p = index($head, $draft_delimiter);
if ($p >= 0) {
seek(MES, $p + length($draft_delimiter), 0);
substr($head, $p + 1) = '';
}
if ($find{'head'} || $find{'all'}) {
$head =~ s/\n\s+/ /g; # fix continuation lines
$head = mime_decode_string($head);
}
# read $body if necessary
#
undef $/;
if ($find{'body'}) {
$body = ;
}
# construct $all if necessary
#
if ($find{'all'}) {
$all = $head . ($body ? $body : scalar());
}
close(MES);
if ($eval_string || $dup_check eq 'none') {
if (eval $eval_string) {
push(@messages, $m);
}
} else {
# check dupulicate message-id
$head =~ m/Message-id:\s*<(.*)>/i;
my $msgid = $1;
$head =~ m/Subject:\s*(.*)/i;
my $subject = $1;
if ($dup_check eq "" || $dup_check eq "message-id") {
if ($MESSAGE_ID_HASH{$msgid}++) {
push(@messages, $m);
}
} elsif ($dup_check eq "message-id+subject") {
my $t = join(";", $msgid, $subject);
if ($t ne ";" and $MESSAGE_ID_HASH{$t}++) {
push(@messages, $m);
}
}
}
}
return @messages;
}
##################################################
##
## Parse expression
##
sub EOL { 0; }
sub LITERAL { 1; }
sub SYMBOL { 2; }
sub parse_expression($$) {
my($expr, $casefold) = @_;
my $case_flag = '';
my $expr_string = '';
my $eval_string = '';
$case_flag = 'i' if ($casefold);
# split into tokens
my $STOPCHARS = '(["\']|\\\\(?:.|$)|\s*(?:[!()=]|\&\&?|\|\|?)\s*)';
my $SYMBOLS = '[!()=]|\&\&?|\|\|?';
my @tokens = ();
my($escape, $pos, $len) = (0) x 3;
my($token, $quote) = ('') x 2;
my $str;
LEX:
foreach $str (split($STOPCHARS, $expr)) {
next LEX if ($str eq '');
# process quoted string
if ($quote ne '') {
if ($quote eq $str) {
$quote = '';
$len++;
next LEX;
}
$token .= $str;
$len += length($str);
next LEX;
}
# escaping
if ($str eq '\\') {
parse_die('Unexpected end of line', $expr, $pos + 1);
}
if ($str =~ /\\(.)/) {
$token .= $1;
$len += 2;
next LEX;
}
# quoting
if ($str =~ /^[\'\"]$/) {
$quote = $str;
$len++;
next LEX;
}
if ($str =~ /^\s*($SYMBOLS)\s*$/) {
if ($token ne '') {
push(@tokens, [LITERAL, $token, $pos - $len]);
$token = '';
$len = 0;
}
push(@tokens, [SYMBOL, $1, $pos + index($str, $1)]);
next LEX;
}
$token .= $str;
$len += length($str);
} continue {
$pos += length($str);
} # end of LEX:
# flush remaining literal
if ($token ne '') {
push(@tokens, [LITERAL, $token, $pos - $len]);
$token = '';
}
if ($quote ne '') {
parse_die('Quoting not closed', $expr, $pos);
}
push(@tokens, [EOL, '', $pos]); # end of line
# automaton status:
#
#
# 0: before expression: '('->0, '!'->0, LITERAL->2, EOL->end
# 1: after expression: ')'->0, '|'->0, '&'->0, EOL->end
# 2: after field: '='->3, others ->error
# 3: before pattern: LITERAL->1, fallback to 1
#
my($status, $paren) = (0) x 2;
my($field, $pattern, $string) = ('') x 3;
# my $token;
PARSE:
foreach $token (@tokens) {
# print "$token->[0]:$token->[1]:$status\n";
if ($status == 0) {
if ($token->[0] == LITERAL) {
$status = 2;
$field = $token->[1];
next PARSE;
}
if ($token->[0] == EOL) {
last PARSE;
}
if ($token->[1] eq '(') {
$paren++;
$eval_string .= '(';
next PARSE;
}
if ($token->[1] eq '!') {
$eval_string .= 'not ';
next PARSE;
}
parse_die('Unexpected symbol', $expr, $token->[2]);
}
if ($status == 1) {
if ($token->[0] == LITERAL) {
parse_die('Syntax error', $expr, $token->[2]);
}
if ($token->[0] == EOL) {
last PARSE;
}
if ($token->[1] eq ')') {
if (--$paren < 0) {
parse_die('Unbalanced parenthesis', $expr, $token->[2]);
}
$eval_string .= ')';
$status = 1;
next PARSE;
}
if ($token->[1] =~ /&/) {
$eval_string .= 'and ';
$status = 0;
next PARSE;
}
if ($token->[1] =~ /\|/) {
$eval_string .= 'or ';
$status = 0;
next PARSE;
}
parse_die('Unexpected symbol', $expr, $token->[2]);
}
if ($status == 2) {
if ($token->[0] == SYMBOL and $token->[1] eq '=') {
$status = 3;
next PARSE;
}
parse_die('Missing \'=\'', $expr, $token->[2]);
}
if ($status == 3) {
if ($token->[0] == LITERAL) {
$pattern = $token->[1];
}
$field =~ s/([@\/])/\\$1/g;
$pattern = make_japanese_pattern($pattern);
if ($field eq 'body') {
$pattern = "." unless $pattern;
$string = "\$$field =~ /$pattern/om$case_flag";
} elsif ($field =~ /^(all|head)$/) {
$pattern = "." unless $pattern;
$string = "\$$field =~ /$pattern/om$case_flag";
} elsif ($field ne '') {
$string = "\$head =~ /^$field:.*$pattern/om$case_flag";
} else {
parse_die('Search pattern not specified', $expr, $token->[2]);
}
$status = 1;
$eval_string .= "$string ";
$field = '';
$pattern = '';
$string = '';
if ($token->[0] == LITERAL) {
next PARSE;
}
redo PARSE;
}
} # end of PARSE:
if ($paren > 0) {
parse_die('Parenthesis not closed', $expr, length($expr));
}
# simple check by perl interpreter
my($head, $body, $all) = ('') x 3;
eval "$eval_string";
if ($@) {
if ($main::opt_quiet) {
exit $EXIT_ERROR;
}
if ($main::opt_verbose) {
im_die("something wrong with the expression:\n$@");
}
im_die("something wrong with the expression\n");
}
# print "$eval_string\n"; exit 0;
return $eval_string;
}
sub parse_die($$$) {
my($die, $expr, $pos) = @_;
if (!$main::opt_quiet and !$main::opt_verbose) {
im_die("$die in the expression\n");
}
if (!$main::opt_quiet and $main::opt_verbose) {
im_die("$die\n$expr\n" . (" " x $pos) . "^\n");
}
exit $EXIT_ERROR;
}
##################################################
##
## sort and uniqify a list
##
sub sortuniq(@) {
my(@target) = @_;
my(%tmp);
@tmp{@target} = (undef) x @target;
return(sort {$a <=> $b} keys %tmp);
}
##################################################
##
## multi-line and Japanese string search
##
my @in = ('\e\$\@', '\e\$B');
my @out = ('\e\(J', '\e\(B');
my $in = join('|', (@in));
my $out = join('|', (@out));
my $shiftcode = '(' . join('|', @in, @out) . ')';
my $chargap = '(' . join('|', @in, @out, '\s'). ')*';
sub make_japanese_pattern {
my $pat = shift;
my $result = '';
my $jis = 0;
#
# If the parameter contains EUC or SJIS string, convert it to
# ISO-2022-JP code. Probably this should be done in imgrep with
# user specified original code rather than expecting it.
#
if ($pat =~ /[\201-\376]/) {
$pat = IM::Japanese::conv_iso2022jp($pat, 'NoHankana');
}
for (split(/$shiftcode/, $pat)) {
if (/$in/o) {
$jis = 1;
$result .= $chargap if $result;
}
elsif (/$out/o) {
$jis = 0;
}
elsif ($jis) {
$result .= join($chargap, map(quotemeta, m/../g));
}
else {
#
# Replace space characters by \s*. This enables to find
# several word sequence across line boundary.
#
s/(.)/$1 =~ m@\s@ ? '\\s*' : quotemeta($1)/eg;
$result .= $_;
}
}
length($result) ? $result : undef;
}
1;
__END__
=head1 NAME
IM::Grep - grep mail/news folder
=head1 SYNOPSIS
use IM::Grep;
$eval_string = &parse_expression($expression, $casefold);
@message_number_array = &grep_folder($folder_dir, $eval_string, @ranges);
=head1 DESCRIPTION
The I module enumerates mail/news messages matched given patterns.
This modules is provided by IM (Internet Message).
=head1 COPYRIGHT
IM (Internet Message) is copyrighted by IM developing team.
You can redistribute it and/or modify it under the modified BSD
license. See the copyright file for more details.
=cut
### Copyright (C) 1997, 1998, 1999 IM developing team
### All rights reserved.
###
### Redistribution and use in source and binary forms, with or without
### modification, are permitted provided that the following conditions
### are met:
###
### 1. Redistributions of source code must retain the above copyright
### notice, this list of conditions and the following disclaimer.
### 2. Redistributions in binary form must reproduce the above copyright
### notice, this list of conditions and the following disclaimer in the
### documentation and/or other materials provided with the distribution.
### 3. Neither the name of the team nor the names of its contributors
### may be used to endorse or promote products derived from this software
### without specific prior written permission.
###
### THIS SOFTWARE IS PROVIDED BY THE TEAM AND CONTRIBUTORS ``AS IS'' AND
### ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
### IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
### PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE TEAM OR CONTRIBUTORS BE
### LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
### CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
### SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
### BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
### WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
### OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
### IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.