#!/usr/bin/perl # http://www.cs.dal.ca/~vlado/srcperl/report-new.pl # Copyright 2000-7 Vlado Keselj www.cs.dal.ca/~vlado sub help { print STDERR <<"#EOT" } # Report new material on a web page, version $VERSION # # The program normally runs as a cron job, so the result is sent as an # e-mail if it something changes. It is convenient for keeping a # watch on interesting web pages. It uses diff, lynx or wget, # sendmail (if option -e is used) # # Usage: report-new.pl [switches] URL # -h Print help and exit. # -v Print version of the program and exit. # -e email Sends output, if not empty, to email. # -D dir Working directory. By default, ${HOME}/agn is used if # exists, otherwise the current directory. # -d method Dumping method. The default is lynx_txt; the other # option is wget. # -g Debugging mode. The new file is not fetched, but the program # reruns on the last two different versions of the page. # -p procf Input processor. There is supposed to be a file named # procf that can be 'require'-ed, in Perl sense, which defines # a subroutine named procf, which is used to filter the file # before making diff. The escape string '!KEEP: ' can be left # at the beginning of the lines which we want to appear in diff # whenever there is a difference. # More documentation included close to the end of the source file. #EOT use strict vars; use POSIX qw(strftime); use Carp; use vars qw( $VERSION ); $VERSION = sprintf "%d.%d", q$Revision: 1.16 $ =~ /(\d+)/g; use Getopt::Std; use vars qw($opt_v $opt_h $opt_e $opt_d $opt_D $opt_p $opt_g); getopts("hve:D:d:p:g"); if ($opt_v) { print "$VERSION\n"; exit; } elsif ($opt_h || !@ARGV) { &help(); exit; } my $dump = \&dump_lynx_txt; if ($opt_d ne '') { if ($opt_d eq 'lynx_txt') { $dump = \&dump_lynx_txt } elsif($opt_d eq 'wget' ) { $dump = \&dump_wget } else { &help(); print STDERR "error: '-d $opt_d'"; exit -1; } } if ($opt_D eq '') { if (-d $ENV{HOME}."/agn") { $opt_D = $ENV{HOME}."/agn" } else { $opt_D = '.' } } elsif (! -d $opt_D ) { die "directory \"$opt_D\" does not exist" } ($#ARGV==0 && $ARGV[0]=~/^http:\/\//) || die "Format: report-new.pl http://...\n"; my ($urlbase, $url); $urlbase = $url = shift; # E.g.: http://www.cs.dal.ca/~vlado/srcperl if ( $url =~ m.//[^/]*/. ) { $urlbase = $`.$& } # E.g.: http://www.cs.dal.ca/ my $urlId = &encode_w1($url); my $timestamp = strftime("%Y-%m-%d-%T", localtime(time)); if (! -d "$opt_D/tmp") { mkdir "$opt_D/tmp", 0700 or die "can't mkdir $opt_D/tmp: $!" } if (! -d "$opt_D/report-new.pl.d") { mkdir "$opt_D/report-new.pl.d", 0700 or die "can't mkdir $opt_D/report-new.pl.d: $!" } chdir $opt_D; my $TmpBase = "$opt_D/tmp/$urlId-$timestamp"; my $TmpFile1 = "$TmpBase-1"; my $TmpFile2 = "$TmpBase-2"; my $lastFile = "$opt_D/report-new.pl.d/$urlId.last"; -e $lastFile or putfile($lastFile,''); my $lastFile1 = "$opt_D/report-new.pl.d/$urlId.last-1"; -e $lastFile1 or putfile($lastFile1,''); # First step: fetch the page, unless option -g is given if (! $opt_g ) { &$dump($url, $TmpFile1); my $f1 = getfile($lastFile); my $f2 = getfile($TmpFile1); if ($f1 eq $f2) { rename($TmpFile1, $lastFile); &_exit(0); } # nothing changed, exit rename($lastFile, $lastFile1); rename($TmpFile1, $lastFile); } my $material = getfile($lastFile); my $material1 = getfile($lastFile1); if ($opt_p) { require $opt_p; $material = &$opt_p($material); $material1 = &$opt_p($material1); } putfile($TmpFile1, $material); putfile($TmpFile2, $material1); my $diffres = `diff $TmpFile1 $TmpFile2 2>&1`; $diffres =~ s/^[^<].*\n//mg; $diffres =~ s/^< //mg; if ($diffres ne '') { if ($opt_p && index($material, '!KEEP: ') > -1) { $material =~ s/^!KEEP: //mg; putfile($TmpFile1, $material); $diffres = `diff $TmpFile1 $TmpFile2 2>&1`; $diffres =~ s/^[^<].*\n//mg; $diffres =~ s/^< //mg; } if ($opt_e) { my $out; open($out, "|sendmail -t") or die; print $out "To: $opt_e\n". "Subject: [report-new.pl] $url\n\n$diffres"; close($out); } else { print $diffres } } &_exit(0); sub _exit { my $r = shift; unlink $TmpFile1 if -e $TmpFile1; unlink $TmpFile2 if -e $TmpFile2; exit $r; } sub putfile($@) { my $f = shift; local *F; open(F, ">$f") or die "putfile:cannot open $f:$!"; print F '' unless @_; while (@_) { print F shift(@_) } close(F) } sub encode_w1( $ ) { local $_ = shift; s/[\W_]/'_'.uc unpack("H2",$&)/ge; return $_; } sub dump_lynx_txt { my $url = shift; my $file = shift; local *F; open(F,"|lynx -dump -nolist - > \Q$file") or die "lynx error:$!"; print F $url; close(F); } sub dump_wget { my $url = shift; my $file = shift; system('wget', '--quiet', '-O', $file, $url); } sub getfile($) { my $f = shift; local *F; open(F, "<$f") or croak "getfile:cannot open $f:$!"; my @r = ; close(F); return wantarray ? @r : join ('', @r); } __END__ =head1 NAME report-new.pl - Report new material on a web page =head1 SYNOPIS report-new.pl [switches] URL =head1 DESCRIPTION Reports new material on a web page. Typically used as a cron job. -h Print help and exit. -v Print version of the program and exit. -e email Sends output, if not empty, to email. -D dir Working directory. By default, ${HOME}/agn is used if exists, otherwise the current directory. -d method Dumping method. The default is lynx_txt; the other option is wget. -p procf Input processor. There is supposed to be a file named procf that can be 'require'-ed, which defines a subroutine named procf, which is used to filter the file before making diff. The escape string '!KEEP: ' can be left at the beginning of the lines which we want to appear in diff whenever there is a difference. =head1 PREREQUISITES POSIX qw(strftime); uses diff, lynx or wget, sendmail (if option -e is used). =head1 SCRIPT CATEGORIES Web =head1 README Reports new material on a web page. =head1 SEE ALSO Scripts: wget =head1 THANKS I would like to thank Peet Moris for bug reports and comments. =head1 COPYRIGHT Copyright 2000-7 Vlado Keselj F This script is provided "as is" without expressed or implied warranty. This is free software; you can redistribute it and/or modify it under the same terms as Perl itself. The latest version can be found at F. =cut