newscan 2.0 - a Perl Network News Scanner (Part 2 of 4) 
Author Message
 newscan 2.0 - a Perl Network News Scanner (Part 2 of 4)


Archive-name: newscan/part02

#!/bin/sh
# this is newscan.shar.02 (part 2 of newscan)
# do not concatenate these parts, unpack them in order with /bin/sh
# file newscan continued
#
# Timestamp probe.  Run touch with a leading numeric operand, then inspect
# which files showed up.  shar_touch ends up as `touch` only when no file
# named 1231235999 appeared and $$.touch exists; in every other case it is
# the no-op `:` and a warning is printed.  Both probe files are removed.
shar_touch=:
touch -am 1231235999 $$.touch >/dev/null 2>&1
if [ -f 1231235999 ] || [ ! -f $$.touch ]; then
  echo 'WARNING: not restoring timestamps'
else
  shar_touch=touch
fi
rm -f 1231235999 $$.touch
#
# Unpack-order guard.  _sharseq.tmp must be readable, otherwise the user is
# told to unpack part 1 and the script exits with status 1.  Its contents are
# loaded into shar_sequence; any value other than the string 2 prints which
# part to unpack next and exits with status 1.
[ -r _sharseq.tmp ] || {
  echo 'Please unpack part 1 first!'
  exit 1
}
shar_sequence=`cat _sharseq.tmp`
case "$shar_sequence" in
  2) ;;
  *)
    echo "Please unpack part $shar_sequence next!"
    exit 1
    ;;
esac
if test ! -f _sharnew.tmp; then
  echo 'x - still skipping newscan'
else
  echo 'x - continuing file newscan'
  sed 's/^X//' << 'SHAR_EOF' >> 'newscan' &&
open(MBOX,">>$mbox");
#
# connect to the NNTP server
# $port = 119 unless $port;
#
$them = $ENV{'NNTPSERVER'} unless $them;
# use NNTPSERVER environment variable
X
X
$AF_INET = &AF_INET;                # &AF_INET defined in sys/socket.ph
$SOCK_STREAM = &SOCK_STREAM;        # = 2 for Irix 5 ( 1 for Irix 4 )
X
$sockaddr = 'S n a4 x8';
X
$hostname = $ENV{'LOCALHOST'} unless $hostname; # get name of local host
# from environment variable
chop($hostname = `hostname`) unless $hostname;  # get name of local host
#from Unix hostname utility
$them = $hostname unless $them; # try local machine as NNTP Server
X
# translate protocol name to associated number
($name,                         # human-readable name of protocol
X $aliases,                     # human-readable aliases of protocol
X $proto) = getprotobyname('tcp'); # protocol number ( 6 for tcp )
X
# translates service (port) name to corresponding number
($name,                         # name of service (NNTP in this case)
X $aliases,                     # aliases for service
X $port,                                # port number of service (119 in this case)
X $proto) = getservbyname('nntp' ,'tcp')        # protocol (TCP in this case)
X       unless $port =~ /^\d+$/;
X
# translates network hostname to corresponding number
($name,$aliases,$type,$len,$thisaddr) =
X       gethostbyname($hostname);
X
$this_numeric = unpack('N', $thisaddr);
if( !defined $name ) # something went wrong in gethostbyname
{                               # try fully qualified domain name (FQDN)
X                               # if $hostname from Unix hostname command
X    chop($local_host = `hostname`);
X    if($hostname eq $local_host)
X    {
X       chop($domain = `domainname`);
X       $fqdn = $local_host . '.' . $domain;
# try Fully Qualified Domain Name
X       ($name,            # Domain name of host
X        $aliases,         # aliases of the host
X        $type,            # type of address AF_INET = 2 (IP Address Type)
X        $len,             # length of address in bytes (4 for IP)
X        $thisaddr) = gethostbyname($fqdn);
X    }

Quote:
}

elsif( $hostname =~ /\d+/ && $this_numeric == $hostname )
{                             # if hostname is strict number gethostbyname
X                             # may return that number as the IP address!
X    chop($local_host = `hostname`);
X
X    if($hostname eq $local_host) # hostname from hostname command
X    {
X       chop($domain = `domainname`);
X       $fqdn = $local_host . '.' . $domain;
# try fully qualified domain name
X       ($name, $aliases, $type, $len, $thisaddr) =
X           gethostbyname($fqdn);
X    }
Quote:
}

X
X
X
# translate network hostname to corresponding number
if($them =~ /(\d+)\.(\d+)\.(\d+)\.(\d+)/)
{
X       $thataddr = pack('C4',$1,$2,$3,$4);
Quote:
}

elsif($them =~ /(\w+)(\.\w+)*/)
{
X       ($name,$aliases,$type,$len,$thataddr) = gethostbyname($them);
Quote:
}

else
{
X       die "Fatal error: NNTP host not specified in proper format!\n";
Quote:
}

$this = pack($sockaddr, $AF_INET, 0, $thisaddr);
$that = pack($sockaddr, $AF_INET, $port, $thataddr);
X
# Make the socket a filehandle
X
if(socket(S, $AF_INET, $SOCK_STREAM, $proto))
{
#       print "socket ok\n";
Quote:
}

else
{
X       die "Fatal Error: socket failed ", $1,"\n";
Quote:
}

X
# give the socket an address
if(bind(S, $this))
{
#       print "bind ok\n";
Quote:
}

else
{
X    ($ip1, $ip2, $ip3, $ip4) = unpack('C4', $thisaddr);
X    $localaddr = $ip1 . '.' . $ip2 . '.' . $ip3 . '.' . $ip4;
X    $this_numeric = unpack('N', $thisaddr);
X
X        if ( $hostname =~ /^\d+$/ )
X       {
X           print <<"EofError";
newscan: FATAL ERROR!!  
newscan: Bind to local host $hostname ( IP $localaddr ) failed!
newscan:
newscan: Your official host name ( $hostname ) is a strict number.
newscan: This can cause problems with some implementations of the sockets
newscan: networking software.  The software may interpret the numeric name
newscan: as an Internet address instead of looking up the Internet address
newscan: in the host table or domain name system.
newscan:
newscan: newscan thinks your local host's Internet address is $localaddr
newscan: As a 32 bit number, this address is $this_numeric
newscan:
newscan: If $this_numeric = $hostname ( your official host name )
newscan:    the host table lookup almost certainly failed!!
newscan:
newscan: Quick Fix (works on some systems)
newscan:
newscan:   If your host has a valid alias host name that
nwescan: is not a strict number, e.g. $hostname.megacorp.com
newscan: specify this alias to newscan via the LOCALHOST environment
newscan: variable or the -H <alias host name> command line argument:
newscan:
newscan:     newscan -H $hostname.megacorp.com
newscan:
EofError
X
X    if ( $opt_a ) # sound effects only if requested by user
{
X    &PlayFile($audio_file); # play an audio file in Sun/Next format
Quote:
}

X    die "newscan: Aborting newscan - bye now :-( \n" ;
X       }
X       else
X       {
print <<"EofError";
newscan: ERROR!!  Bind to local host $hostname (IP $localaddr ) failed!
newscan: You may want to check that TCP/IP is up and configured correctly
newscan: on your system.  For example, if you run SLIP as the author
newscan: of newscan does, you may want to verify that SLIP is running.
newscan:
EofError
X
X    if ($opt_a)
{                               # play audio file
X    &PlayFile($audio_file);
Quote:
}                               #

die "newscan: Aborting newscan - bye now :-( \n";
Quote:
}
}

X
# Call up the server
if(connect(S,$that))
{
#       print "Connect to $them ok\n";
Quote:
}

else
{
X die "Fatal Error: Apparently, can't connect to $them tcp/ip port $port.",
X "Error: ", $1, "\n";              # list to avoid over 80 char lines
Quote:
}

X
#  Set socket to be command buffered
select(S); $| = 1; select(STDOUT);
# loop over groups to search
$_ = <S>;  # read confirmation of connection message from NNTP server
X
($stat, $rest) = split(/ /,$_,2);       # split connection message
X
if( $stat == 200 || $stat == 201 )
{
X    # on initial connection, NNTP server will return
X    # 200 server ready - posting allowed
X    # 201 server ready - no posting allowed
X    # otherwise there has been a problem
Quote:
}

else
{
X die "newscan: Abort! NNTP Server $them refused connection with message: ",
X  "$stat $rest \n";
Quote:
}

#
# After connection, send MODE READER command.  MODE READER will put
# INN news servers into mode where they recognize RFC 977 NNTP commands.
#
print S "MODE READER\n";
$_ = <S>; # read reply from server (error if nntpd, 200 xxx if inn)
X
($stat, $rest) = split(/ /, $_, 2); # split response to MODE READER command
X
if( $stat == 200)
{
X    if(!$opt_X && !$opt_s)
X    {
X       print "Information: News Server Supports MODE READER command!\n";
X    }
Quote:
}

elsif ( $stat == 500 )
{
X    if(!$opt_X && !$opt_s)
X    {
X       print "Information: News Server Does Not Use MODE READER command.\n";
X    }
Quote:
}

else
{
X  die "newscan Fatal Error: Unexpected response to MODE READER command\n";
Quote:
}

X
# implement Unix glob-style pattern matching for news group selection
#  * asterisk matches 0 or more characters (* is .* in regular expressions)
#  ? question mark matches any 1 character (? is . in regular expressions)
#
X
$k = 0;

{
X    if( ($Group =~ /([^\\]|^)\*/) || ($Group =~ /([^\\]|^)\?/) )
X    {
X       $expandGroup{$Group} = $k; # offset into selectedGroups
X    }
X    $k++;
Quote:
}

X
if(keys expandGroup)
{
X    $total_groups = 0;
X    $oldTime = time();
X    print S "LIST\n";                # retrieve list of valid groups
X                                # from NNTP Server
X    $flush_save = $|;
X    $| = 1; print "L"; $| = $flush_save;


X
X    $_ = <S>;                    # read first line back
X    while(! /^\.[^\.].*$/)
X    {
X       $total_groups++;
X       chop; chop;             # remove trailing \r\n
X       $line = $_ . "\n";    # add trailing newline

X
X       $flush_save = $|;
X       $| = 1;
X       if (! $opt_X )
X       {
X           print ".";
X       }
X       else # running as server for X GUI front end
X       {
X           if ( time() - $oldTime > 1 )
X           {
X               print "108 $total_groups $#validGroups $line\n";
X               $oldTime = time();
X           }
X       }
X       $| = $flush_save;
X       $_ = <S>;         # read next line
X    }
Quote:
}

X

{
X    ($vGroup, $remainder) = split(/ /,$vGroup);
#    $flush_save = $|;
#    $| = 1; print "."; $| = $flush_save;
Quote:
}

X
foreach $Group (keys expandGroup)
{


X    ($GroupMatch = $Group) =~ s/\*/.*/g; # replace asterisk with .*
X
X    $GroupMatch =~ s/\+/\\\+/g;
X    $GroupMatch =~ s/\.([^\*])/\\\.$1/g; # replace . with \.
X    $GroupMatch =~ s/\?/\./g;  
# replace ? with . --- needs to follow . --> \.
X    $GroupMatch = "^"  .  $GroupMatch;
X    $GroupMatch = $GroupMatch .  "\$" ;
X    

X    {
X       if ( $vGroup =~ /$GroupMatch/ ) # match valid group to group
X       {
X           push(subList, $vGroup);
X       }
X    }
X
X    if($#subList < 0)
X    {
X       print STDERR "newscan: Warning $Group doesn't match any Newsgroup ",
"known to News Server!\n"; # says News not NNTP, is clearer.
X    }
X

X    &RangeSplice(*range, $Group, *subList); # won't work????
X    &AssocSplice(*pattern, $Group, *subList);
X    &AssocSplice(*required, $Group, *subList);
X    &AssocSplice(*veto, $Group, *subList);
Quote:
}                               # end loop over groups to expand

X
# remove deselected groups
X

X

{
X    $dGroup =~ s/\*/.*/g;      # change * to .* (Perl regexp version)
X    $dGroup =~ s/\+/\\\+/g;    # change + to \+
X    $dGroup =~ s/\.([^\*])/\\\.$1/g; # change . not followed by * to \.
Quote:
}

X

{
X    $match = $FALSE;

X    {
X       if( $Group =~ /$dGroup/ ) # use =~ to get support for * expansion
X       {
X           $match = $TRUE;
X       }
X    }                          # end loop over deselected groups
X
X    if($match)
X    {                          #
X       delete $range{$Group};
X       delete $pattern{$Group};
X       delete $required{$Group};
X       delete $veto{$Group};
X       undef $Group;   # don't undef $dGroup, may match another $Group
X    }                  #
Quote:
}

X

X
X
# loop over groups that are keys of range array
# this allows different searches for different groups
# %range associative array includes other groups
# such as cross posting groups
X
foreach $group (sort keys %range)
{
# send group command to NNTP server
X       print S "GROUP $group \n";
X       $_ = <S>; # read status reply from NNTP server
#        if ($!)
#           {
#               print "Socket Read Returned Error!\n";
#           }
X       ($status,$gn,$gfirst,$glast,$gname) = split(/ /);
X       if($status == 411)
X       {
X               warn "Warning! Group $group does not exist! Skipping to ",
"next group!\n";
X       }
X       elsif($status == 211) # loop over articles in the group
X       {                      
X           if( $gn > 0 )    # if group is not empty
X           {
X
X             $found_in_group{$group} = 0; # initialize to none found
X             if( ! $opt_s && ! $opt_X )
X               {
X                 $flush_save = $|; # save flush control
X                   $| = 1;             # flush on every print or write
X                     print "G( $group )";
X                 $| = $flush_save; # return to standard method
X               } # indicate doing a newsgroup
X          
X                 if( $opt_X && $opt_X eq "RUN" )
X                   {
X                     print "104 GROUP $group \n";
X                   }
X
X
X             $i = $gfirst;
X             $prevArticle = $gfirst;
X             $do = 1;
X             $article_count = 0;       # start with no articles
X
X           while($do)
X           {
# check if article is in not in excluded range (previously scanned articles)
X                       if(!&InRange($range{$group},$i))
X                       {
# check if article exists
X                               print S "STAT $i\n";
X                               $_ = <S>;  # read reply
X                               ($status,$article,$id,$rest) = split(/ /);
X                               if($status == 223)
X                               {
# article retrieved
X                                   if ( ! $opt_s && ! $opt_X )
X                                   {
X                                       $flush_save = $|;
X                                       $| = 1;
# flush on every print or write
X                                       print "\."; # period for each
# article scanned
X                                       $| = $flush_save;
X                                            
X                                   }
X                                       print S "article\n";
X                                       $_ = <S>;
X                                       ($status,$article,$id,$rest) =
X                                           split(/ /);
X                                       if($status == 220)
X                                       {

# try to free memory

X                                               $_ = <S>;
# read first line of text
X
X                                               while(!/^\.[^\.].*$/)
# loop until encounter lone period at start of line (.^J ends text in NNTP)
X                                               {
X                                                       chop; chop;
# remove trailing CR LF (^M ^J)
X                                                       $line = $_ . "\n";
# add line feed LF ^J to end of line

# add line to text list
X                                                       $_ = <S>;
# read another line
X                                               }
X
X                                              
X               &Collect(*text, *statistics, *pairs, *doPairs) if %statistics;
X
X                                  $match = $FALSE; # start with no match
X                                               if(!&Veto(*text,*veto,*group))
X                                               {
# look for match
# deal with required patterns first
X                                                       $Required = $TRUE;
# start by assuming it matches all required patterns
X                          foreach $search (split("\034",$required{$group}))
X                          {
X                               $blatz = 0;

X                               if($blatz == 0)
X                               {
X                                   $Required = $FALSE;
# does not match pattern $search which is required.
X                               }
X                              
X                           }   #
X                                                      
X               if($Required)
X               {
# at this point $match is FALSE (no match has been found )


X                      || $#search_patterns < 0)
# $#array is FINAL SUBSCRIPT of array (0 for one element array
X                   {
X                       $match = $TRUE;
X                   }
# if there are search patterns from WHERE then check

X                   {
X                       $blatz = 0;

X                       if($blatz)
X                       {
X                           $match = $TRUE;
X                       }
X                   } # close loop over search patterns
X               } # close if Required
X                                                   }
# clear the NewsArticle array
X                                           %NewsArticle = ();
# find any cross references
X                                           if(!&GetXRef(*text,*NewsArticle))
X                                               {
X                                                   $NewsArticle{$group} =
X                                                       $article;
X                                               }
# NewsArticle is an associative array containing the group and article number
# in group for the article ( an article may be posted to multiple groups
X                                       # if match store article
X                                           if($match)
X                                           {
X                                               if(! $opt_s && ! $opt_X )
# not quiet
X                                               {
X                                                   $flush_save = $|;
# save i/o buffering state
X                                                   $| = 1;
# immediate output
X                                                   print 'F';
# indicate found an article
X                                                   $| = $flush_save;
X                                               }
X                                             $statistics{'FOUND'}++;
# total number of articles found during search
X                                             $found_in_group{$group}++;
# total number of articles found in this newsgroup
X                                             $total_articles_found++;
X                                             &ToMailBox(*text, $MboxFormat);

X                                           }

X                                           foreach $newsgroup (keys %range)
X                                           {
X                                               $ArticleInGroup =
X                                                   $NewsArticle{$newsgroup};

X                                                   split(',',
X                                                       $range{$newsgroup});
X                                              
X                                               &UpDateRange(*theRange,
X                                                          *ArticleInGroup);
X                                               $range{$newsgroup} =

X                                           }
X                                          
X                                       } #
X                                   else
X                                   {
X                                       warn "Warning! ARTICLE ",
"command returned unexpected status response: $status \n";
X                                   }
X                               }
X                               elsif($status == 423)
X                               {
# 423 no such article number in this group
X                                       warn "Warning! Article ",
"$i in Group $group does not exist!\n";
X                               }
X                               elsif($status == 430)
X                               {
# 430 no such article found
X                                   warn "Warning! Article $i ",
"in Group $group not found!\n";
X                               }
X                               else
X                               {
X                                   die "Aborting! ",
"STAT $i in Group $group returned unexpected status response: $status.\n";
X                               } # end if for result of STAT command
X                              
X                           }  # end if !&InRange  
X                       else    # skip article
X                       {       #
# print s to indicate skipping an excluded
# article
X                           if( ! $opt_s && ! $opt_X )
X                           {   #
X                               $flush_save = $|;
X                               $| = 1;
X                               print "s";
X                               $| = $flush_save;
X                           }   #
X                       }       #
X                      
X                       $article_count++; # number of articles scanned
X                        $total_articles_scanned++;
X
X                        if ($opt_X && ($opt_X eq "RUN")
X                           && ( time() - $oldTime > 1 ))
X                       {
X                           $percent = $article_count/$gn;
X                           $percent *= 100.0;
X                           printf "105 $group %4.2f %d %d %d %d\n",
$percent, $found_in_group{$group}, $article_count,
$total_articles_found, $total_articles_scanned;
X                           $oldTime = time();
X                       }
X
X                          
X                       if( ($article_count % 50) == 0)
X                       {       # print percent done every 50 articles
X                           $percent = $article_count/$gn;
X                           $percent *= 100.0;
X
#                           if ( $opt_X && $opt_X eq "RUN" )
#                           {
#                               printf "105 $group %4.2f %d %d %d %d\n",
# $percent, $found_in_group{$group};
#                           }
X
X                           if (! $opt_s && ! $opt_X )
X                           {
X                               $flush_save = $|;
X                               $| = 1;
X                               printf "( %4.2f\%", $percent;
X                               print " )";
X                               $| = $flush_save;
X                           }
X                       }
X                       &NNTPNext;  # go to next article in group
X               } # end while($do)
X       }                       # end if $gn > 0 (not an empty group )
X       }
X       else                    # doesn't recognize NNTP response to GROUP
X       {
X               die "Abort! NNTP GROUP $group ",
"command returned unexpected response: $status. \n";
X       }
Quote:
} # close loop over groups to search

print S "quit\n"; # close the connection to the server
close MBOX;       # close the file of found news articles
X
# update the configuration file
X
&FixRange(*range);  # clean up the range
X
foreach $group (keys %range)
{
X       $notPresent{$group} = 'T';
Quote:
}

X
foreach $i (0 .. $#config)
{
X       $_ = $config[$i];
X       foreach $group (keys %range)
X       {
X               $qgroup = $group;
X               $qgroup =~ s/(\W)/\\\1/g;
# quote non word characters (e.g. +)
X               if(s/$qgroup\s*:(.*)/$group:$range{$group}/)
X               {
X                       $notPresent{$group} = 'F';
X               }
X              
X       }
X       $config[$i] = $_;
Quote:
}

#
# append group range lines if don't exist
# only do this for groups that have been selected
# don't care about cross postings to groups that have not been
# selected
#
foreach $group (keys %range)  # keys of range are all groups to be searched
{
X       if($notPresent{$group} =~ /T/)
X       {
X               $line = "$group:$range{$group}\n";

X       }
Quote:
}

X
open(CONFIG,">$configFile");

close CONFIG;
# save statistics
X
if($statistics{'ALL'})
{
X       open(STAT,">>$statFile");  # open to append to file
X       print STAT "######\n";

X       print STAT "Statistics on regular expression incidence ",

X       ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst)
X           = localtime(time);
X       $month = (Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec)[$mon];
X       $year = '19' . $year;
X       print STAT "Collected $hour:$min:$sec (Local Time)  ",
"$month $mday, $year \n";
X       print STAT "Regular Expression (Perl Syntax) : ",
"Number of Articles With a Match \n";
X       $length = 0;
X       foreach $pattern (keys %statistics)
X       {
X               $curLength = length($pattern);
X               $length = $curLength if $curLength > $length;
X       }
X
X       if($length + 2 < 60)
X       {
X               $length = $length + 2;
X       }
X       else
X       {
X               $length = 60;
X       }
X
X       foreach $pattern (keys %statistics)
X       {
X               printf STAT "%-${length}s: %d\n", $pattern,  
$statistics{$pattern};
X       }
X       if($doPairs)
X       {
X         print STAT " Incidence of pairs of regular expressions \n";
X         print STAT " Only pairs that occur in at least one (1) ",
"article are reported. \n";   #
X         print STAT " <Perl Regular EXpression><Perl Regular Expression>",
" : Number of Articles with Match :  Correlation Coefficient \n";
X
X               $length = 0;
X               foreach $key (keys %pairs)
X               {
X                       $length = length($key) if length($key) > $length;
X               }
X
X               if($length < 58)
X               {
X                       $length += 2;
X               }
X               else
X               {
X                       $length = 60;
X               }
X
X               foreach $key (keys %pairs)
X               {
X                       $Nall = $statistics{'ALL'};
X                       $Npairs = $pairs{$key};
X                       ($pOne,$pTwo) = split(/\034/,$key);
X                       $NOne = $statistics{$pOne};
X                       $NTwo = $statistics{$pTwo};
X                       $muOne = $NOne/$Nall;
X                       $muTwo = $NTwo/$Nall;
X                       $varOne = $muOne - ($muOne * $muOne);
X                       $varTwo = $muTwo - ($muTwo * $muTwo);
X
X                       if($varOne && $varTwo)
X                       {
X                               $corr
X                                 = ( (($Npairs)/$Nall) - ($muOne * $muTwo))
X                                       /(sqrt($varOne) * sqrt($varTwo));
X                       }
X                       else
X                       {
X                               $corr = "UNDEFINED";
X                       }
X
X                       ($index = $key) =~ s/\034/ /;
X
X                       printf STAT "%-${length}s : %-10d : %5.3f \n",
$index, $pairs{$key}, $corr if $pairs{$key} && !($corr =~ "UNDEFINED");
X               }
X       }
X       close STAT;
Quote:
}

X
if(! $opt_s && ! $opt_X ) # Let user know where found articles are
{
X    $flush_save = $|;          # save original i/o buffering method
X    $| = 1;                    # force flush on each print or write
X    if(! defined $statistics{'FOUND'} )
X    {
X       $statistics{'FOUND'} = 0;
X    }
X
X  print "\nnewscan: Search completed! $statistics{'FOUND'} News articles ",
"with match saved in $mbox ! \n";
# think number of articles found is comforting to user
# - I find it comforting to see
X    $| = $flush_save;
Quote:
}

X
if( ( ! $opt_s && ! $opt_X ) && $statFile && -e $statFile )        
# if statistics file name defined
{
X    print "newscan: Search statistics saved in $statFile. \n";
# Let user know where to find statistics
Quote:
}

X
if( ! $opt_s && ! $opt_X )
{
X    print "newscan: Search Completed! \n"; # Final exit message
Quote:
}

X
if( $opt_X && $opt_X eq 'RUN' )
{
X  print "205 QUIT\n"; # let client (xnewscan) know newscan search terminated
Quote:
}

X
if( $opt_a )
{
X    &PlayFile($audio_file);        # play audio file to indicate completion
Quote:
}

X
##### Support Subroutines Follow #####
X
sub parse
{
# parse the configuration file (Network News Query Language NNQL)
#
# This subroutine handles lexical scanning and parsing of the newscan
# configuration file.  It fills in various data structures that drive
# the search.  These data structures function as an Intermediate
# Representation (IR) of the query in compiler terminology.  The search
# engine uses these data structures to perform the semantic processing
# in compiler terminology.  The semantics (meaning) in this case is
# finding articles with matches to the query.
#

X
X    $yyi = 0;                  # use yyi to avoid global conflict for now
X
X    &yyparse();                    # call byacc Perl parser
X
X    return 1;                  # return
Quote:
}

X
X
sub AssocSplice                 # splice into an associative array
{
# asterisk * prefix to pass variables by name (not value)

X

X    {
X       $assoc{$xGroup} = $assoc{$group};
X    }
X    delete $assoc{$group};
Quote:
}

X
sub RangeSplice                 # splice into range array
{

X

X    {
X       if( ! $assoc{$xGroup} )
X       {
X           $assoc{$xGroup} = $assoc{$group};
X       }
X    }
X    delete $assoc{$group};
Quote:
}

X
X
sub Collect
{


X       local($pattern);
X       local(%FoundPattern) = () if $doPairs;
X       local($pOne, $pTwo);
X       local(%done) = () ;  # array of found patterns
X
X       foreach $pattern (keys %statistics)
X       {



X                   || $pattern =~ 'FOUND';

X               {
X                       $FoundPattern{$pattern} = 1 if $doPairs;
X                       $statistics{$pattern}++;
X               }
X              
X       }
X       $statistics{'ALL'}++;  # count all articles scanned
X
X       if($doPairs)
X       {
X               foreach $pOne (keys %statistics)
X               {
X                       $done{$pOne} = 1;
X                       foreach $pTwo (keys %statistics)
X                       {
X                               if(!$done{$pTwo})
X                               {
X                                       if($FoundPattern{$pOne}
X                                          && $FoundPattern{$pTwo})
X                                       {
X                                               if($pairs{$pOne,$pTwo})
X                                               {
X                                                 $pairs{$pOne,$pTwo}++;
X                                               }
X                                               else
X                                               {
X                                                 $pairs{$pOne,$pTwo} = 1;
X                                               }
X                                       } # end if found pattern one and two
X                               } # end if pattern two not already checked
X                       } # end loop over pattern two
X               } # end loop over pattern one
X       }
X       return;
Quote:
}

X
sub NNTPNext
{
#
# Name: NNTPNext
#
# Description: Sends the NNTP "next" command on the socket S and acts on
# the status code at the start of the server's one-line reply:
#
#   223   the second field of the reply (the article number) is stored
#         in the global $i
#   421   no next article in group: the global $do is set to 0
#   420   no current article selected: a warning is printed and $do is
#         set to 0
#   else  the program is aborted with die
#
# Takes no arguments and returns nothing useful; it communicates through
# S, $i and $do.  The globals $status, $article, $id and $rest are
# overwritten with the space-separated fields of the reply.
#
X       print S "next\n";
X       $_ = <S>;               # retrieve reply to next
X
# An undefined read means the server closed the connection; report that
# directly instead of an "unexpected response" with an empty status.
X       die "Abort! No response from NNTP server to NEXT command.\n"
X           unless defined($_);
X
X       ($status,$article,$id,$rest) = split(/ /);
X       if($status == 223)
X       {
# 223 n a message
X               $i = $article;  # $i is global variable
X                               # containing the article number
X       }
X       elsif($status == 421)
X       {
# 421 no next article in group
X               $do = 0;
X       }
X       elsif($status == 420)
X       {
# 420 no current article has been selected
X               warn "newscan: NNTP Server has returned ",
"420 no current article has been selected error message in response to
NNTP NEXT command.\n";
X               $do = 0;
X       }
X       else
X       {
X               die "Abort! Unexpected response $status ",
"from NEXT command.\n";
X       }
}

X
sub InRange
{
#
# Name: InRange
#
# Description: Returns 1 if the number $n matches an entry of @ranges and
# 0 otherwise.  Each entry must be either a single number "nn" or an
# inclusive span "nn-mm" (surrounding white space is allowed); any other
# entry aborts the program with die.
#
# NOTE(review): neither $n nor @ranges is assigned anywhere in this sub
# as it appears here.  The empty lines directly below look like the place
# where the argument-unpacking statements used to be -- confirm against
# the original distribution before relying on this code.
#

X

X
X       local($answer) = 0;  # return false unless in range
X       local($i);
# should use a more efficient search algorithm here now that ranges
# are sorted in ascending order
X
X       for $i (0 .. $#ranges)
X       {
X               $_ = $ranges[$i];
X               if(/^\s*(\d+)\s*$/)  
X               {
# single article number: remember a hit but keep scanning
X                       $1 == $n ? ($answer = 1) : ($answer);
X               }
X               elsif(/^\s*(\d+)-(\d+)\s*$/)
X               {
# span of article numbers: a hit returns immediately
X                       if($n >= $1 && $n <= $2)
X                       {
X                               $answer = 1; return $answer;
X                       }
X               }
X               else
X               {
X                 die "Abort in InRange! Syntax error in range: $_ \n";
X               }
X       }
X
#       print "InRange: answer is $answer \n";
X       return $answer
# NOTE(review): the "Quote:" line below does not look like part of the
# program (it resembles quoting markup added when the article was
# archived); perl would read it as a statement label -- confirm and remove.
Quote:
}

X                      
X
X
X
sub ToMailBox
{
#
# Name: ToMailBox
# Date: 1993
# Author: John F. McGowan
#
# Description: Stores articles in mailbox format folder file.
#
# The mailbox flavour is chosen by matching $fmt (case-insensitively)
# against "unix", "elm" or "mmdf".  For the unix and elm flavours the
# message separator is a line "From <path> <date>" built from the results
# of GetPath and GetDate on *text; for MMDF both the header and the
# trailer are a line of four \001 characters.
#
# NOTE(review): in the text as it appears here nothing reads the argument
# list, loops over the article lines, or prints $header/$trailer to the
# mailbox.  The empty lines inside each branch look like places where
# statements were lost -- confirm against the original distribution.
#

X       local($path) = &GetPath(*text);
X       local($date) = &GetDate(*text);
X       if( $fmt =~ /unix/i )
X       {
X           local($header) = sprintf("%s %s %s\n","From",$path,$date);
#
# Problems with the mail (or Mail) mail reader.
#
# Unix mail, in its infinite wisdom, treats
# a blank line followed by a line starting with From followed
# by a space as the beginning of a new message.  Except, it appears from
# reports from users, for the very first article in the mail folder file.
# The mail mail reader will actually split a message containing a line
# starting with From into two messages.  elm seems to ignore a line
# in the body of the mail message starting with From.  So, following
# the sendmail convention, newscan will change From .... to >From ....
#
# Dave Taylor's elm mail reader on the other hand treats
# ^From\s+[^\s]\s+{Unix date}.*\n as start of a message!  So elm rarely
# has problem with lines starting with From in body of message and doesn't
# care about blank line before From
#
X

# NOTE(review): presumably this block was the body of a loop over the
# lines of the article; as shown it escapes a leading "From " in the
# single variable $line only.
X           {
X               $line =~ s/^From />From /g;
X           }


X                               # for Unix mailer
X       }
X       elsif ($fmt =~ /elm/i)
X       {
X           local($header) = sprintf("%s %s %s\n","From",$path,$date);

X       }
X       elsif ($fmt =~ /mmdf/i) # MMDF mailbox format
X       {
X           local($header) = "\001\001\001\001\n";
X           local($trailer) = $header;


X       }
# NOTE(review): the "Quote:" line below does not look like part of the
# program (it resembles quoting markup added when the article was
# archived); perl would read it as a statement label -- confirm and remove.
Quote:
}

X
sub GetPath
{
#
# Name: GetPath
#
# Description: Looks for a "Path:" (or "path:") header line and returns
# the white-space-free value that follows the colon.  If no value was
# found a warning is printed and the sub ends without an explicit return.
#
# NOTE(review): $line is initialised to the empty string and never
# changed in the text shown here, so the match below cannot succeed as
# written.  Callers pass *text; presumably the argument unpacking and a
# loop header over the article lines stood on the empty lines -- confirm
# against the original distribution.
#

X       local($line) = '';
X       local($path) = '';
X

X       {
X               $_ = $line;
# the whole header value must be one run of non-blank characters
X               if(/^\s*[Pp]ath\s*:\s*([^\s]*)\s*$/)
X               {
X                       $path = $1;
X               }
X       }
X       if($path)
X       {
X               return $path;
X       }
X       else
X       {
X               warn "Warning! Could not find a Path line in article!\n";
X       }
# NOTE(review): the "Quote:" line below does not look like part of the
# program (it resembles quoting markup added when the article was
# archived); perl would read it as a statement label -- confirm and remove.
Quote:
}

X
sub GetDate
{
#
# Name: GetDate
#
# Description: Finds the "Date:" (or "date:") header line and rewrites
# its value as "wday month mday hh:mm:ss yyyy zone", the fields being
# joined with single spaces.  The reformatted date is returned; if no
# date was produced a warning is printed and the sub ends without an
# explicit return.
#
# The header value is tried against a fixed sequence of formats and the
# first one that matches wins, so the order of the elsif chain matters.
# Points worth knowing when reading the branches:
#   - two-digit years are prefixed with '19';
#   - when the zone is a numeric offset (only the form -dddd is matched)
#     or is absent, the zone is reported as 'GMT' and the time is left
#     unadjusted;
#   - formats without a weekday obtain one from GetWeekDay;
#   - a Date line in an unrecognised format yields the dummy date
#     "Mon Jan 1 12:00:00 1800 GMT" after three warnings.
#
# Side effects: the globals $wday, $mday, $month, $year, $time and $zone
# are overwritten.
#
# NOTE(review): $line is initialised to the empty string and never
# changed in the text shown here.  Callers pass *text and the closing
# comments speak of a "loop over lines in article"; presumably the
# argument unpacking and the loop header stood on the empty lines --
# confirm against the original distribution.
#

X       local($line) = '';
# NOTE(review): $date is declared twice; the second declaration is
# redundant.
X       local($date) = '';
X       local($date) = '';
X
#
# build the regular expressions from two halves each so that no source
# line exceeds the 78 char limit
# some mail and news software have problems with files that exceed
# 80 chars per line.  
#
# $date_fmt:  Tue, 18 May 1993 11:24:33 -0400
X       local($fmt1) = '(\w\w\w),\s+(\d\d?)\s+(\w\w\w)\s+(\d\d\d\d)';
X       local($fmt2) = '\s+(\d?\d:\d\d:\d\d)\s+(-\d\d\d\d)\s*';
X       local($date_fmt) = $fmt1 . $fmt2;
X
# $date_fmt2: Tue, 18 May 93 11:24:33 -0400
X       local($fmt3) = '(\w\w\w),\s+(\d\d?)\s+(\w\w\w)\s+(\d\d)';
X       local($fmt4) = '\s+(\d?\d:\d\d:\d\d)\s+(-\d\d\d\d)\s*';
X       local($date_fmt2) = $fmt3 . $fmt4;
X
# $date_fmt3: Tue, 18 May 1993 11:24:33 GMT
X       local($fmt5) = '(\w\w\w),\s+(\d\d?)\s+(\w\w\w)\s+(\d\d\d\d)\s+';
X       local($fmt6) = '(\d?\d:\d\d:\d\d)\s+(\w\w\w)\s*';
X       local($date_fmt3) = $fmt5 . $fmt6;
X

X       {
X               $_ = $line;
X               if(/^\s*[Dd]ate\s*:\s*(.*)$/)
X               {
# from here on $_ holds only the value of the Date header
X                       $_ = $1;
X                       if(
/(\w\w\w),\s+(\d\d?)\s+(\w\w\w)\s+(\d\d)\s+(\d?\d:\d\d:\d\d)\s+(\w\w\w)\s*/)
X                       {
# dates of form: Tue, 18 May 93 11:24:33 GMT
X                               $wday = $1;
X                               $mday = $2;
X                               $month = $3;
X                               $year = '19' . $4;
X                               $time = $5;
X                               &FixTime(*time);
X                               $zone = $6;
X                               $date =
X                            join(' ',$wday,$month,$mday,$time,$year,$zone);
X                       }
X                       elsif(/$date_fmt2/)
X                       {
# dates of form: Tue, 18 May 93 11:24:33 -0400
X                               $wday = $1; $mday = $2; $month = $3;
X                               $year = '19' . $4;
X                               $time = $5;
X                               &FixTime(*time);
X                               $zone = 'GMT';  # kludge for this
X                               $date = join(' ',$wday,$month,
X                                            $mday,$time,$year,$zone);
X                       }
X                       elsif(/$date_fmt/)
X                       {
# dates of form: Tue, 18 May 1993 11:24:33 -0400
X                               $wday = $1; $mday = $2; $month = $3;
X                               $year = $4;
X                               $time = $5;
X                               &FixTime(*time);
X                               $zone = 'GMT';  # kludge for this
X                               $date = join(' ',$wday,
X                                         $month,$mday,$time,$year,$zone);
X                       }
X                       elsif(/$date_fmt3/)
X                       {
# dates of form: Tue, 18 May 1993 11:24:33 GMT
X                               $wday = $1; $mday = $2; $month = $3;
X                               $year = $4; $time = $5;
X                               &FixTime(*time);
X                               $zone = $6;
X                               $date = join(' ',$wday,
X                                         $month,$mday,$time,$year,$zone);
X                       }
X                       elsif(
/(\d\d?)\s+(\w\w\w)\s+(\d\d\d\d)\s+(\d?\d:\d\d:\d\d)\s+(\w\w\w)\s*/)
X                       {
# dates of form: 18 May 1993 11:24:33 GMT
X                               $mday = $1; $month = $2; $year = $3;
X                               $time = $4;
X                               &FixTime(*time);
X                               $zone = $5;
X                               $wday = &GetWeekDay($mday,$month,$year);
X                               $date = join(' ',$wday,
X                                        $month,$mday,$time,$year,$zone);
X                       }
X                       elsif(
/(\d\d?)\s+(\w\w\w)\s+(\d\d)\s+(\d?\d:\d\d:\d\d)\s+(\w\w\w)\s*/)
X                       {
# dates of form 18 May 93 11:24:33 GMT
X                               $mday = $1;
X                               $month = $2;
X                               $year = '19' . $3;
X                               $time = $4;
X                               &FixTime(*time);
X                               $zone = $5;
X                               $wday = &GetWeekDay($mday,$month,$year);
X                               $date = join(' ',$wday,
X                                         $month,$mday,$time,$year,$zone);
X                       }
X                       elsif(
/(\d\d?)\s+(\w\w\w)\s+(\d\d\d\d)\s+(\d?\d:\d\d)\s+(\w\w\w)\s*/)
X                       {
# dates of format 1 Jul 1993 05:54 CST
X                               $mday = $1;
X                               $month = $2;
X                               $year = $3;
X                               $time = $4 . ':00';
X                               &FixTime(*time);
X                               $zone = $5;
X                               $wday = &GetWeekDay($mday,$month,$year);
X                               $date = join(' ',$wday,
X                                         $month,$mday,$time,$year,$zone);
X                       }
X                       elsif(
/(\d\d?)\s+(\w\w\w)\s+(\d\d)\s+(\d?\d:\d\d)\s+(\w\w\w)\s*/)
X                       {
# dates of format 1 Jul 93 05:54 CST
X                               $mday = $1;
X                               $month = $2;
X                               $year = '19' . $3;
X                               $time = $4 . ':00';
X                               &FixTime(*time);
X                               $zone = $5;
X                               $wday = &GetWeekDay($mday,$month,$year);
X                               $date = join(' ',$wday,
X                                         $month,$mday,$time,$year,$zone);
X                       }
X                       elsif(
/(\d\d?)\s+(\w\w\w)\s+(\d\d)\s+(\d?\d:\d\d:\d\d)\s+(-\d\d\d\d)\s*/)
X                       {
# dates of form: 18 May 93 05:11:21 -0400
X                               $mday = $1;
X                               $month = $2;
X                               $year = '19' . $3;
X                               $time = $4;
X                               &FixTime(*time);
X                               $zone = 'GMT';
# temporary kludge until i figure out what -0400 means
X                               $wday = &GetWeekDay($mday,$month,$year);
X                               $date = join(' ',$wday,
X                                         $month,$mday,$time,$year,$zone);
X                       }
X                       elsif(
/(\d\d?)\s+(\w\w\w)\s+(\d\d\d\d)\s+(\d?\d:\d\d:\d\d)\s+(-\d\d\d\d)\s*/)
X                       {
# dates of form: 18 May 1993 05:11:21 -0400
X                               $mday = $1;
X                               $month = $2;
X                               $year = $3;
X                               $time = $4;
X                               &FixTime(*time);
X                               $zone = 'GMT';
# temporary kludge until i figure out what -0400 means
X                               $wday = &GetWeekDay($mday,$month,$year);
X                               $date =
join(' ',$wday,$month,$mday,$time,$year,$zone);
X                       }
X                       elsif(
/(\d\d?)\s+(\w\w\w)\s+(\d\d)\s+(\d?\d:\d\d:\d\d)\s*/)
X                       {
# dates of form: 18 May 93 05:11:21
X                               $mday = $1; $month = $2;
X                               $year = '19' . $3;
X                               $time = $4;
X                               &FixTime(*time);
X                               $zone = 'GMT';  #kludge
X                               $wday = &GetWeekDay($mday,$month,$year);
X                               $date =
join(' ',$wday,$month,$mday,$time,$year,$zone);
X                       }
X                       elsif(
/(\d\d?)\s+(\w\w\w)\s+(\d\d\d\d)\s+(\d?\d:\d\d:\d\d)\s*/)
X                       {
# dates of form: 18 May 1993 05:11:21
X                               $mday = $1; $month = $2;
X                               $year = $3;
X                               $time = $4;
X                               &FixTime(*time);
X                               $zone = 'GMT';  #kludge
X                               $wday = &GetWeekDay($mday,$month,$year);
X                               $date =
join(' ',$wday,$month,$mday,$time,$year,$zone);
X                       }
X                       else
X                       {
X                               warn "Warning! Format of date used in
news article is not recognized by newscan: $_ \n";
warn "Warning! newscan is unable to reformat this date to the date format
required for the mailbox format file!\n";
warn "Warning! A dummy date is used so that the mailbox format file with
the stored articles will function with a mail reader!\n";
# use a dummy date below so it knows that I did find a Date line
# but could not parse date format
X                               $wday = 'Mon'; # kludge
X                               $mday = 1;
X                               $month = 'Jan';
X                               $year = '1800';
X                               $time = '12:00:00';  # noon
X                               $zone = 'GMT';
X                               $date = join(' ',$wday,
X                                        $month,$mday,$time,$year,$zone);
X                           }  # end parsing of date
X                       last;  
# leave loop over lines after first Date line
X                   }  # end if date line
X           }                   # end loop over lines in article
X
X       if($date)
X       {
X               return $date;
X       }
X       else
X       {
X               warn "Warning! Could not find Date line in article!\n";
X       }
# NOTE(review): the "Quote:" line below does not look like part of the
# program (it resembles quoting markup added when the article was
# archived); perl would read it as a statement label -- confirm and remove.
Quote:
}

X
sub FixRange
{
#
# Name: FixRange
#
# Description: For every newsgroup that is a key of %range, calls
# CleanUpRange on the array sortedRange.
#
# NOTE(review): nothing in the text shown here fills sortedRange from
# $range{$group} or stores the cleaned-up result back.  The empty lines
# inside the loop look like places where statements were lost (presumably
# a split/sort of the group's range list before the call and a join
# after it) -- confirm against the original distribution.
#


X
X       foreach $group (keys %range)
X       {


X               &CleanUpRange(*sortedRange);

X       }
X       return;
# NOTE(review): the "Quote:" line below does not look like part of the
# program (it resembles quoting markup added when the article was
# archived); perl would read it as a statement label -- confirm and remove.
Quote:
}

X
sub SortElements
{
#
# Name: SortElements
#
# Description: Comparison routine for perl's sort.  $a and $b are range
# elements of the form "nn" or "nn-mm"; a single number nn is treated as
# the span nn-nn.  Elements are ordered by their lower bound and, when
# the lower bounds are equal, by their upper bound.
#
# Returns -1, 0 or 1 as sort expects.  Dies with "Error in range" if an
# element contains no number at all.
#
# Side effects: the bounds that were parsed are left in the globals
# $minA, $maxA, $minB and $maxB.
#
X       if($a =~ /(\d+)\-(\d+)/)
X       {
X               ($minA,$maxA) = ($1,$2);
X       }
X       elsif($a =~ /(\d+)/)
X       {
X               $minA = $maxA = $1;
X       }
X       else
X       {
X               die "Error in range\n";
X       }
X
X       if($b =~ /(\d+)\-(\d+)/)
X       {
X               ($minB,$maxB) = ($1,$2);
X       }
X       elsif($b =~ /(\d+)/)
X       {
X               $minB = $maxB = $1;
X       }
X       else
X       {
X               die "Error in range\n";
X       }
X
# Compare the lower bounds first and fall back to the upper bounds.  sort
# needs a consistent ordering: a test that mixes conditions on both
# bounds calls nested or overlapping spans (such as 1-10 and 5) equal
# even when they compare unequal to a third element, which leaves the
# sorted order undefined.
X       return ($minA <=> $minB) || ($maxA <=> $maxB);
}

X
X
sub CleanUpRange
{
#
# Name: CleanUpRange
#
# Description: Walks the array @theRange, whose elements are article
# numbers "nn" or spans "nn-mm", comparing each element with its
# successor.  When the successor starts inside or directly after the
# current element (and does not end before it) a merged span
# "minLo-maxHi" is built; otherwise the walk advances to the next
# element.  An element without any digits only produces a warning.
#
# NOTE(review): @theRange is never bound to the caller's array in the
# text shown here, and the merge branch builds $newElement without
# storing it or changing $i, so as written the loop would not terminate
# once a merge is found.  The empty lines after the opening brace and
# after the $newElement assignment look like places where statements
# were lost -- confirm against the original distribution.
#

X
X       local($i)=0;
X       local($next);
X       local($element);
X       local($newElement);
X       local($minLo, $maxLo, $minHi, $maxHi);
X
X       while($i < $#theRange)
X       {
# bounds of the current element
X                       if(
X                       ($minLo,$maxLo) = $theRange[$i] =~ /(\d+)\-(\d+)/)
X                       {
X                       }
X                       elsif( ($minLo) = $theRange[$i] =~ /(\d+)/)
X                       {
X                               $maxLo = $minLo;
X                       }
X                       else
X                       {
X                               warn "CleanUpRange Error! \n";
X                       }
X
# bounds of the element that follows it
X                       if( ($minHi,$maxHi) = $theRange[$i+1]
=~ /(\d+)\-(\d+)/)
X                       {
X                       }
X                       elsif( ($maxHi) = $theRange[$i+1] =~ /(\d+)/)
X                       {
X                               $minHi = $maxHi;
X                       }
X                       else
X                       {
X                               warn "CleanUpRange Error! \n";
X                       }
X
# mergeable when the follower begins no later than one past the end of
# the current element, no earlier than its start, and ends at or after it
X                       if($minHi <= ($maxLo + 1) && $minHi >= $minLo
&& $maxLo <= $maxHi)
X                       {
# merge two elements in the range
X                               $newElement = "$minLo\-$maxHi";

X                       }      
X                       else
X                       {
# don't merge elements -- move to next element in list
X                               $i++;  # increment element in range
X                       }                      
X
X       } # end while loop
X       return;
# NOTE(review): the "Quote:" line below does not look like part of the
# program (it resembles quoting markup added when the article was
# archived); perl would read it as a statement label -- confirm and remove.
Quote:
}

sub UpDateRange
{
#
# Name: UpDateRange
#
# Description: Checks the article number $theArticle against a range
# entry $lrange of the form "nn" or "nn-mm".
#   - returns 0 when the article is already covered by the entry;
#   - when the article is directly below or above the entry, builds the
#     extended span in $newRange and returns 1;
#   - returns 1 at the end when no entry covered or adjoined the article.
#
# NOTE(review): $theArticle and $lrange are never assigned in the text
# shown here, and $newRange is built but not stored anywhere.  The empty
# lines after the opening brace, before the bare block and after each
# $newRange assignment look like places where statements were lost
# (the closing comment speaks of a "loop over range list") -- confirm
# against the original distribution.
#

X
X       local($i) = 0;
X       local($lrange);
# theRange is an array consisting of article numbers nn or nn-mm
# where nn and mm are article numbers

X       {
X               local($_) = $lrange;
X               if(/^(\d+)$/)
X               {
# entry is a single article number
X                       if($1 == $theArticle)
X                       {
X                               return 0;
X                       }
X                       elsif($1 == ($theArticle - 1) )
X                       {
X                               local($newRange) = join('-',$1,$theArticle);

X                               return 1; # return 1 if updated range
X                       }
X                       elsif($1 == ($theArticle + 1) )
X                       {
X                               local($newRange) = join('-',$theArticle,$1);

X                               return 1;  # return 1 if updated range
X                       }
X                       else
X                       {
X                       }
X               }
X               elsif(/^(\d+)-(\d+)$/)
X               {
# entry is a span; note that $newRange is not declared local in this
# branch, unlike in the single-number branch above
X                       if($1 <= $theArticle && $2 >= $theArticle)
X                       {
X                               return 0;
X                       }
X                       elsif(($theArticle + 1) == $1 )
X                       {
X                               $newRange = join('-',$theArticle,$2);

X                               return 1; # returns one if updated range
X                       }
X                       elsif(($theArticle - 1) == $2 )
X                       {
X                               $newRange = join('-',$1,$theArticle);

X                               return 1;  # returns one if updated range
X                       }
X                       else
X                       {
X                       }
X               }
X               else
X               {
X               }
X               $i++;
X       }       # end loop over range list
# if it gets here then article is not in excluded range and
# is not one after an existing range

X       return 1;
# NOTE(review): the "Quote:" line below does not look like part of the
# program (it resembles quoting markup added when the article was
# archived); perl would read it as a statement label -- confirm and remove.
Quote:
}

X
sub GetWeekDay
{
#
# Name: GetWeekDay
#
# Description: Returns the entry of @DaysOfWeek for the date given by
# $mday, $month (a month name as stored in @Months) and $year.  The day
# is found by counting the days from 1 Jan 1991 up to the date and
# taking that count modulo 7.
#
# Uses the tables @Months, %DaysInMonth (keyed by month name) and
# @DaysOfWeek, none of which is defined in this sub.
#
# Leap years are taken to be the years for which ($year - 1988) is a
# multiple of 4; no century rule is applied.
#
# NOTE(review): nothing in the text shown here copies the argument list
# into $mday, $month and $year; the empty lines below look like the
# place where that statement was lost -- confirm against the original
# distribution.
# NOTE(review): the loop over past years only runs for $year greater
# than 1991, so no correction is made for dates before 1991 and the
# weekday returned for them is presumably wrong -- confirm.
#

X
X       local($i) = 0;
X       local($days) = 0;  # Number of days since 1 Jan 1991
X                          # 1 Jan 1991 is a Tue
# count number of days in current year to date
X       while($Months[$i] ne $month && ($i < 12))
X       {
X               $days += $DaysInMonth{$Months[$i]};
X               if($Months[$i] eq 'Feb' && ( $year - 1988 ) % 4 == 0)
X               {
X                       $days++;  # add additional day for the leap year
X               }
X               $i++;
X       }
X
X       $days += $mday;
# count number of days in years since 1990 (counts 1991)
X       for(local($past) = 1991; $past < $year; $past++)
X       {
X               if(($past - 1988) % 4) # non-zero if not a leap year
X               {
X                       $days += 365;
X               }
X               else # a leap year
X               {
X                       $days += 366;
X               }
X       }
X
X       local($weekday) = $days % 7;  # at this point 0 is a Monday
X       return $DaysOfWeek[$weekday];
# NOTE(review): the "Quote:" line below does not look like part of the
# program (it resembles quoting markup added when the article was
# archived); perl would read it as a statement label -- confirm and remove.
Quote:
}

X
sub Veto
{
#
# Name: Veto
#
# Description: Returns 1 (veto this article) when $blatz is true and 0
# otherwise.
#
# NOTE(review): $search and $blatz are declared but never assigned in
# the text shown here, so as written the sub always returns 0.  The
# empty lines look like places where the argument unpacking, a loop over
# the veto patterns and the statement that sets $blatz were lost --
# confirm against the original distribution.
#

# veto article if matches a veto pattern
X       local($search, $blatz);  # declare local variables


X       {

X               if($blatz)
X               {
X                       return 1; # veto this article
X               }
X       }
X       return 0;  # no veto
# NOTE(review): the "Quote:" line below does not look like part of the
# program (it resembles quoting markup added when the article was
# archived); perl would read it as a statement label -- confirm and remove.
Quote:
}

X
sub GetXRef
{
#
# Name: GetXRef
#
# Description: Looks for a header line starting with "Xref:" and records
# its entries in the global table %NewsArticle: a field of the form
# "key:value" is split at the colon and stored as
# $NewsArticle{key} = value.  Returns 1 if an entry was stored and 0 if
# not.
#
# NOTE(review): $line and $field are declared but never assigned in the
# text shown here.  The empty lines look like places where the argument
# unpacking, the loop over the article lines and the loop over the
# fields of the Xref line were lost -- confirm against the original
# distribution.
#

# return array of news article numbers

X       local($line,$field,$key,$value);
X       local($iret) = 0;  # did not find an Xref: line in text

X       {
# parse Xref line
X               local($_) = $line;
# drop the last character of the line
X               chop;
X               if(/^Xref:/)
X               {



X                       {
X                               ($key,$value) = split(':',$field);
X                               $NewsArticle{$key} = $value;
X                               $iret = 1;
X                       }
X              
X               }  # end if Xref line
X       } # end loop over lines in article text
X       return $iret;
# NOTE(review): the "Quote:" line below does not look like part of the
# program (it resembles quoting markup added when the article was
# archived); perl would read it as a statement label -- confirm and remove.
Quote:
}

X
sub FixTime
{
#
# Name: FixTime
#
# Description: Rewrites $time in place so that it has the form hh:mm:ss.
#   h:mm:ss  gets a leading '0'
#   hh:mm    gets ':00' appended
#   h:mm     gets both
# A value in any other form is left alone and a warning is printed.
# Returns nothing.
#
# NOTE(review): callers in this file invoke the sub as &FixTime(*time),
# but nothing in the text shown here reads the argument list; the empty
# line below looks like the place where that statement was lost.  As
# written the sub works on the variable $time directly -- confirm
# against the original distribution.
#

X       local($_) = $time;
X       if(/^\d\d:\d\d:\d\d$/)
X       {
# do nothing: time in correct format
X       }
X       elsif(/^\d:\d\d:\d\d$/)
X       {
X               $time = '0' . $time;
X
X       }
X       elsif(/^\d\d:\d\d$/)
X       {
X               $time = $time . ':00';
X       }
X       elsif(/^\d:\d\d$/)
X       {
X               $time = '0' . $time . ':00';
X       }
X       else
X       {
X               warn "Fixtime Warning!  Do Not Recognize
Time format of time: $time !\n";
X       }
X       return;
X
# NOTE(review): the "Quote:" line below does not look like part of the
# program (it resembles quoting markup added when the article was
# archived); perl would read it as a statement label -- confirm and remove.
Quote:
}

X
sub PlayFile
{
#
#   Name: PlayFile
#   Author: John F. McGowan, Ph.D.
#  
#   Description: Use /dev/audio if it exists to play an audio file
#   if the audio file exists.  Otherwise use the Unix BEL to alert the
#   user that something has happened.
#
#   /dev/audio is u-law 8 bit output on Sun Sparcstation  with SunOS 4.x
#   /dev/audio is u-law 8 bit output on Solaris, but buffered differently
#   /dev/audio is Sun/Next format file audio output special file on Linux
#
#   SGI machines do not have /dev/audio interface
#   Sony NEWS line machines use /dev/sb0 to play Sun audio files
#
#   The file handle AUDIO is opened for writing on /dev/audio the first
#   time it is found not to exist and is left open afterwards; the
#   playing itself is delegated to PlayIt.
#
#   NOTE(review): nothing in the text shown here assigns $file from the
#   argument list; the empty lines below look like the place where that
#   statement was lost.  Also note that PlayIt is called with $file in
#   one branch and with *file in the other -- confirm which is intended
#   against the original distribution.
#

X
X    if(! -e AUDIO )            # check if handle to audio
X    {                          # device driver exists
X       open(AUDIO, ">/dev/audio"); # attempt to open audio device
X       if( -e AUDIO )
X       {
X           &PlayIt($file); # play the audio file
X       }
X       else
X       {
X           print "\a";               # use bell signal as default
X       }
X    }
X    else
X    {
X       &PlayIt(*file);             # play the audio file
X    }
# NOTE(review): the "Quote:" line below does not look like part of the
# program (it resembles quoting markup added when the article was
# archived); perl would read it as a statement label -- confirm and remove.
Quote:
}

X
sub PlayIt
{
#
# Name: PlayIt
#
# Description: If the file named by $file exists it is opened on the
# file handle SOUND and closed again; if it does not exist the terminal
# bell character is printed instead.
#
# NOTE(review): nothing in the text shown here assigns $file from the
# argument list, and nothing is read from SOUND or written to the audio
# device between the open and the close.  The empty lines look like
# places where those statements were lost -- confirm against the
# original distribution.
#

X
X    if( -e $file )
X    {
X       open(SOUND,$file);

X       close(SOUND);
X

X    }
X    else
X    {
X       print "\a";           # default to Unix bell
X    }
# NOTE(review): the "Quote:" line below does not look like part of the
# program (it resembles quoting markup added when the article was
# archived); perl would read it as a statement label -- confirm and remove.
Quote:
}

X
##################################################
# Next few lines are legal in both perl and nroff
#
# Seen from perl: ".00;" is a numeric constant used as a statement, the
# text from 'di up to the closing '; is one single-quoted string that
# spans several lines, and __END__ stops compilation so the manual page
# that follows is never parsed as perl.
# Seen from nroff: the requests do what their own \" comments say.  The
# line order and the blank line before 'di must be kept as they are.
# NOTE(review): the matching start of the ignored section is not in this
# part of the file -- confirm it is in the head of the script.
X.00;  # finish .ig
X
'di                      \" finish diversion -- previous line must be blank
X.nr nl 0-1               \" fake up transition to first page again
X.nr % 0                  \" start at page 1
';__END__ #### From here on it's a standard manual page ###
X
X.TH NEWSCAN 1 "June 3, 1993"
X.AT 3
X.SH NAME
newscan \- scans Usenet news for articles matching regular expressions
(uses perl regular expressions).  Articles are saved in a mail folder, a
file in mailbox format.
X.SH SYNOPSIS
X.B newscan [-c configuration-file] [-e] [-r folder-file-specification]
[-H name-of-local-host][-h] [-s]
X.SH DESCRIPTION
X.I newscan
searches selected newsgroups for articles matching patterns.  Patterns are
specified as regular expressions in a configuration or resource file.  You
may also specify patterns to veto articles.  If an article matches a veto
pattern, the article will be ignored even if it contains a search pattern.
The configuration file is specified with the command line argument
-c configuration-file or in the environment variable NEWSCAN.  The command
line argument takes precedence over the NEWSCAN environment variable. If
NEWSCAN is undefined, the configuration file defaults to .newscanrc in the
user's home directory.
X
Typically, newscan is run as a batch job:
X       newscan -s &      ( -s to turn off in progress messages )
OR
X       newscan > newscan.out &  ( redirect in progress messages to file )
X
X.SH OPTIONS
X
X       What You Want To Do             Option
X
X       Get help on
X.I newscan                             -h
X
X       Edit Configuration File         -e
X
X       Override Default Configuration File     -c configuration-file
X
X       Invoke Mail Reader to Read a Folder     -r folder
X
X        Explicitly Set Local Host Name          -H name-of-local-host
X
X       Silent Mode (Suppress In Progress Messages)  -s
X
X
X.SH CONFIGURATION FILE
newscan is controlled by a resource or configuration file containing
commands in a simple language that newscan understands.  These commands
should be all capitals.
X
Configuration file commands:
X
NNTP them.them.com [119] specifies the Internet address of the NNTP
server.  The first argument is the Internet address either as a Fully
Qualified Domain Name or the dotted decimal format for the 32 bit Internet
address.  The optional second argument is the port number of the NNTP
server; this should be 119 if NNTP specification is followed.  Port
119 is reserved for NNTP.
X
X    If this line is not specified, newscan will use the environment
variable NNTPSERVER if it is defined.  If NNTPSERVER is not defined, newscan
will use the local machine as default NNTP server.  The local machine is
frequently not an NNTP server.
X
MBOX <my-file-specification> specifies the file where the found
articles are stored.  This file is a mail folder that may be
read and manipulated by any mailer (Mail User Agent).
<my-file-specification> can be any valid Unix
file-specification (including path).  MBOX interprets a leading tilde
X~ as the user's home directory: e.g.  MBOX ~/tmp/myfile.
X
SELECT my.group his.group alt.group specifies the Internet newsgroups
for which subsequent search specifications apply.  newscan searches all
newsgroups specified by SELECT lines.
X
SELECT supports a wildcard * in newsgroups.  The wildcard works like
the wildcard in file names in Unix shells. For example, SELECT
comp.lang.* will search all groups in the comp.lang hierarchy:
comp.lang.fortran, comp.lang.forth, ..., comp.lang.Pascal, etc.
X
REQUIRE /regexp/[i] specifies a perl regular expression that MUST be
found for a match to occur.  This search criterion applies only to
the newsgroups specified by the last preceding SELECT line.  
X
REQUIRE provides a mechanism to search for articles that contain a
match to more than one search pattern.  For example, REQUIRE /pat1/i
and REQUIRE /pat2/i will find articles that contain pat1 AND pat2; it
will not find an article that has pat1 but NOT pat2 or pat2 but NOT
pat1.  REQUIRE provides a simple mechanism to perform a logical AND of
two search patterns.
X
WHERE /regexp/[i] specifies the perl regular expression to be found.
As in perl, the optional trailing /i tells newscan to ignore case.  This
search criterion applies only to the newsgroups specified by the
last preceding SELECT line.
X
WHERE provides a mechanism to search for articles that contain a match
to one or more of a group of search patterns.  For example, WHERE /pat1/i
and WHERE /pat2/i will find articles that contain pat1 OR pat2 OR both.
Notice the contrast to REQUIRE above!
X
UNLESS /regexp/[i] specifies the perl regular expression used to
exclude an article.  Even if the article contains a match to a WHERE
regular expression it will be excluded (not found) if it contains a
match to an UNLESS expression.  As in perl, the optional trailing /i
tells newscan to ignore case.
X
WORD blatz is equivalent to /\\bblatz\\b/i.  A configuration file can
contain lines such as WHERE WORD vision or REQUIRE WORD finance.
WHERE WORD vision matches the word vision.  WHERE WORD vision would
not match computer-vision since vision is not a word in this case.
This provides a somewhat simpler way to specify searches without
knowing Perl regular expressions.
X
WORDSTEM blatz is equivalent to /\\bblatz.*\\b/i  A configuration file
can contain lines such as WHERE WORDSTEM imag or REQUIRE WORDSTEM imag.
WHERE WORDSTEM imag would match words image, imaging, imager, and so forth.
This provides a somewhat simpler way to specify searches without knowing
Perl regular expressions.
X
PHRASE word1 word2 [... wordn] is equivalent to
/\\bword1\\b\\s+\\bword2\\b/i.  A configuration file can contain lines
such as WHERE PHRASE tcl programming or REQUIRE PHRASE tcl
programming.  This provides a somewhat simpler way to specify searches
without knowing Perl regular expressions.
X
OUTLINE word1 word2 [... wordn] is equivalent to
/\\bword1\\b.*\\bword2\\b/i.  A configuration file can contain lines
such as WHERE OUTLINE software engineer or REQUIRE OUTLINE windows
engineer.  OUTLINE windows engineer would match ms windows engineer,
windows software engineer, as well as windows engineer.  Note that
this is different from PHRASE windows engineer which would only match
windows engineer!  This provides a somewhat simpler way to specify
searches without knowing Perl regular expressions.
X
COLLECT STATISTICS IN <file-specification> line tells newscan to collect
statistics on the incidence of regular expressions in all articles scanned
(not just articles that match).  These statistics are printed in human
readable form in the file <file-specification>.  newscan appends the
statistics information to <file-specification>.
X
COLLECT STATISTICS ON /regexp/[i] line tells newscan to collect
statistics on incidence of perl regular expression regexp.  newscan
counts the number of articles that contain at least one match to
regexp.
X
COLLECT STATISTICS ON {SEARCH|WHERE} PATTERNS tells newscan to collect
statistics on all perl regular expressions specified by WHERE /regexp/[i]
lines in the configuration file.
X
COLLECT STATISTICS ON {VETO|UNLESS} PATTERNS tells newscan to collect
statistics on all perl regular expressions specified by UNLESS /regexp/[i]
lines in the configuration file.
X
COLLECT STATISTICS ON PAIRS tells newscan to collect statistics on all
pairs of perl regular expressions for which statistics are being
collected.  newscan counts the number of articles containing at least
one match to both regular expressions.  newscan also calculates a
simple correlation coefficient between the two regular expressions in
the pair.  Note that if the correlation coefficient is 1.0, then the
two regular expressions always occur together; this means one regular
expression is a redundant (unneeded) search criterion.
X
my.favorite.group:1-1100,1105-1110 is a line in the configuration file
that lists ranges of articles in a group that are excluded from
search.  newscan updates this range after it finishes to avoid
repeating a search.  If this line is not provided by the user, newscan
will generate the group range line and append it to the end of the
configuration file.  newscan generates the group range line for any
other groups that a found article has been posted to.
X
X.SH SAMPLE RESOURCE FILE
X
NNTP nntphost
X
MBOX myBox
X
SELECT misc.jobs.offered ba.jobs.offered
X
WHERE /gui/i
X
WHERE /motif/i
X
WHERE /graphic/i
X
UNLESS /From:.*Headhunter/i
X
UNLESS /Subject:.*Recruit/i
X
misc.jobs.offered:1-1000
X
ba.jobs.offered:
X
X.SH ENVIRONMENT
X.I NEWSCAN
environment variable defines the configuration or resource file that
controls the search.  If NEWSCAN is not defined, then newscan defaults
to the file .newscanrc in the user's home directory.  NEWSCAN is
superseded by the -c configuration-file command line argument.
X
X.I NNTPSERVER
environment variable specifies the Internet address of the system NNTP
server.  The NNTP <nntphost> line in the configuration file takes
precedence over the NNTPSERVER variable.
X
X.I READER
environment variable selects the mail reader used by newscan.  If READER
is not set, newscan will try to find and use a mail reader of its choosing.
newscan will use Dave Taylor's elm if it exists.
X
X.I EDITOR
environment variable selects the editor used by newscan.  If EDITOR is not
set, newscan will try to use first emacs and then vi.
X
X.I LOCALHOST
environment variable explicitly sets the name of the local host.  newscan
defaults to using the Unix
X.B
hostname
utility to get the local host name.  In most cases, there is no need to
set LOCALHOST.  However, at least on Linux, if the local host's official
name returned by hostname is a number such as 440, the
X.B
gethostbyname
function called by newscan will not return the correct Internet address.  
In this case, set LOCALHOST to a valid alias of the official name that is
not a number.  For example, 440 might have an alias 440.rahul.net.  Use
440.rahul.net.
X
X.SH FILES
X.I $HOME/.newscanrc
is the default resource or configuration file
specifying the search.  This can be overridden from the command line
(using -c configuration-file) or by setting the NEWSCAN environment
variable.  The command line argument supersedes the NEWSCAN
environment variable.
X
X.I newscanBox
is the default file where newscan saves found articles in mailbox
format.  This can be overridden by using the MBOX file-name command in
the configuration file.
X
X.SH PERL REGULAR EXPRESSIONS
X
newscan uses Perl regular expressions to specify the patterns that it
searches for in the selected newsgroups.  Perl regular expressions are
similar to the regular expressions used in sed, vi, and emacs, but
more extensive.
X
X    .           Matches any character except for newline
SHAR_EOF
  : || echo 'restore of newscan failed'
fi
echo 'End of newscan part 2'
echo 'File newscan is continued in part 3'
echo 3 > _sharseq.tmp
exit 0
--



Fri, 29 Aug 1997 07:13:37 GMT  
 
 [ 1 post ] 

 Relevant Pages 

1. newscan 2.0 - a Perl Network News Scanner (Part 3 of 4)

2. newscan 2.0 - a Perl Network News Scanner (Part 1 of 4)

3. newscan 2.0 - Announcement for Network News Scanner

4. newscan 1.105 - a Network News Scanner (Part 2 of 3)

5. newscan 1.66 - a Perl Network News Article scanner (requires NNTP)

6. newscan 1.45 - a NetNews network news article scanner

7. Announcing newscan 1.105 - a Network News Scanner

8. newscan 1.45 (a news scanner) in comp.sources.misc

9. Announcing newscan 1.66 - NNTP NetNews Scanner!

10. newscan 2.0 - xnewscan errors

11. newscan 2.0 - MODE READER Problem

12. looking for net news scanner

 

 
Powered by phpBB® Forum Software