#!/usr/bin/perl
# History
#
# 1.2 19970910 added forward link reporting and broken link highlighting
# and chunked up code into subroutines
#
# 1.1.3 added optional reading of compressed log files
#
# 1.1.2 added blanking filter for random URL things
#
# 1.1.1 added optional stripping of '?' parms from target
# and referring URLS
#
# 1.1 added command line parsing, support for multiple log formats
# and inclusion of previous reports and multiple log files. Made hotlinking
# mandatory, and made the exclusion patterns case-insensitive
#
# 1.0.1 corrected a problem with double counting hits from
# people who use directoryname instead of directoryname/ to access
# default html files (caused by 1.4 counting both the
# original hit and the redirect), combined #anchor hits with
# their base documents and removed redundant ':80' port specifications
# from reported URLS. Thanks goes to richie@ljouwert.et.tudelft.nl
# (http://morra.et.tudelft.nl/~richie/) for these suggested improvements.
#
# Also, the report is now sorted by number of hits as well as by
# target page and the ability to exclude particular pages from
# the report has been added, both as refering pages and as target pages.
$version="1.2.1";
# For decompressing logs, if necessary
$ungzip = "/bin/gzip -cd ";
$unbzip2 = "/usr/bin/bzip2 -cd ";
# Exclusion patterns (uncomment to use)
$EXCLUDEREFSTO='\.gif|\.jpg';
#$EXCLUDEREFSFROM='^http:\/\/www.devilbunnies.org|\-$';
# URL blanking filters for targets
$BLANKTARGET ='/chat.cgi/.*$';
$BLANKTARGETREPLACE = '/chat.cgi';
# $BLANKTARGET ='(\/\d+\-\d+\-menu.asis|\/\d+\-\d+.asis|\?newsgroup.*)$';
# $BLANKTARGETREPLACE = '(multiple)';
# URL blanking filters for referrer
$BLANKREFER ='/chat.cgi/.*$';
$BLANKREFERREPLACE = '/chat.cgi';
# $BLANKREFER ='(\/\d+\-\d+\-menu.asis|\/\d+\-\d+.asis|\?newsgroup.*)$';
# $BLANKREFERREPLACE = '(multiple)';
# Strip '?' variable parameters from target URLs
$STRIPTARGETPARMS=1;
# Strip '?' variable parameters from referring URLs
$STRIPREFPARMS=0;
# default path to the referer_log
$RefererLog="/www/common/logs/nihongo/access_log";
# Name of server
$HTTPDSERVER='www.nihongo.org';
# Minimum number of references for a ref to be included in the list
$MinRefs=2;
# Log type (combined, multihost-combined, referer)
$LogType="combined";
# Roxen hack - for 400 series status codes, get the referrer from the
# RFC931 field because the Roxen server does stupid stuff
$roxenhack=0;
# Print the Broken Links Report
$PrintBrokenLinks=1;
# Print the Reverse Link Report
$PrintReverseLinks=1;
# Print the Forward Link Report
$PrintForwardLinks=1;
# Restrict a multihost-combined analysis to webhosts matching the following regex
$Only_Specified_Host = '^(www\.)?nihongo\.org';
# Include or don't include webhost in local page target URL
$Include_Webhost_In_Target = 0;
# Protocal to assume for local page target URLs if webhost is included in the local target URL
$Assumed_Protocal = 'http';
&Initialize;
&IncludeOldFiles($IncludeFile) if ($IncludeFile);
@ARGV = ("$RefererLog") if ($#ARGV == -1);
&ReadLogFiles(@ARGV);
if ($OutputFile) {
open (OUTPUTFILE,">$OutputFile") || die ("Could not open $OutputFile for writing.\n$!");
binmode OUTPUTFILE;
select(OUTPUTFILE);
}
print <<"EOF";
Referring URL Statistics for ${HTTPDSERVER}
$refscounter references examined this run.
Last updated:
EOF
print scalar(localtime);
print <<"EOF";
This is a report of what URL browsers reported as the
referring URL that directed them to a particular web page here.
Inaccuracies are due mainly to some browsers reporting wrong
information under some conditions.
EOF
if ($MinRefs > 1) {
print "Only referring URLs with at least $MinRefs reports were included ",
"in this list.\n
\n";
}
print "
\n\n\n";
&DoBrokenLinks if ($PrintBrokenLinks);
&DoForwardLinks if ($PrintForwardLinks);
&DoReverseLinks if ($PrintReverseLinks);
print <<"EOF";
Statistics generated by
RefStats $version
/
snowhare\@nihongo.org
EOF
select(STDOUT);
# All Done. Everything after this is subroutines.
# Print all the broken links
sub DoBrokenLinks {
print <<"EOF";
EOF
$LastWebPage='';
foreach $key (sort bytargetthenhits keys(%TargetCounter)) {
next if ($StatusFlag{$key} != 404);
($target,$referrer)=split(/ /,$key,2);
if ("$target" ne "$LastWebPage") {
print "- $target
\n";
}
print "- $referrer",
" ($TargetCounter{$key} reference";
if ($TargetCounter{$key} > 1) {
print "s";
}
if ($StatusFlag{$key} == 404) {
print ', Broken Link';
}
print ")
\n";
$LastWebPage=$target;
}
print <<"EOF";
Return To Top
EOF
}
# Print all the reverse links
sub DoReverseLinks {
local ($Status,$LastWebPage,$key,$target,$referrer);
print <<"EOF";
$StartReverse
EOF
$LastWebPage='';
foreach $key (sort bytargetthenhits keys(%TargetCounter)) {
($target,$referrer)=split(/ /,$key,2);
next if ($TargetCounter{$key} < $MinRefs);
if ("$target" ne "$LastWebPage") {
print "- $target
\n";
}
print "- $referrer",
" ($TargetCounter{$key} reference";
if ($TargetCounter{$key} > 1) {
print "s";
}
$Status=$StatusFlag{$key};
if ($Status == 404) {
print ', Broken Link';
}
print ")
\n";
$LastWebPage=$target;
}
print <<"EOF";
$EndReverse
Return To Top
EOF
}
# Print all the forward links
sub DoForwardLinks {
local ($Status,$LastWebPage,$key,$referrer,$target);
print <<"EOF";
$StartForward
EOF
$LastWebPage='';
foreach $key (sort bysourcethenhits keys(%SourceCounter)) {
($referrer,$target)=split(/ /,$key,2);
next if ($SourceCounter{$key} < $MinRefs);
if ("$referrer" ne "$LastWebPage") {
print "- $referrer
\n";
}
print "- $target ($SourceCounter{$key} reference";
if ($SourceCounter{$key} > 1) {
print "s";
}
$Status=$StatusFlag{"$target $referrer"};
if ($Status == 404) {
print ', Broken Link';
}
print ")
\n";
$LastWebPage=$referrer;
}
print <<"EOF";
$EndForward
Return To Top
EOF
}
# for sorting the reverse report
sub bytargetthenhits {
($targeta)=($a=~m#^(.+)\s#o);
($targetb)=($b=~m#^(.+)\s#o);
$inequality=($targeta cmp $targetb);
if ($inequality) {
$inequality;
} else {
$TargetCounter{$b}<=>$TargetCounter{$a};
}
}
# for sorting the forward report
sub bysourcethenhits {
($sourcea)=($a=~m#^(.+)\s#o);
($sourceb)=($b=~m#^(.+)\s#o);
$inequality=($sourcea cmp $sourceb);
if ($inequality) {
$inequality;
} else {
$SourceCounter{$b}<=>$SourceCounter{$a};
}
}
# Merge old reports with this one. Is order of operation dependant
# - you can end up with bogus 'Status' codes if you merge new
# to old instead of old to new.
sub IncludeOldFiles {
local (@IncludeFilesList)=@_;
local ($target,$source,$line,$Section,$GotData,$IncludeFile);
foreach $IncludeFile (@IncludeFilesList) {
if (! open (INCLUSION,$IncludeFile)) {
warn "Could not open $IncludeFile for inclusion.\n$!";
return;
}
binmode INCLUSION;
$Section=0;
while ($line = ) {
chop $line;
# The funky if statements are so you
# can include or omit everything except the
# broken links report and still get your data
# The bailout is so the order of the reports
# can be changed without breaking the code
if ($line eq $StartForward) {
$Section=1;
next;
}
if ($line eq $EndForward) {
$Section=0;
last;
}
if ($line eq $StartReverse) {
$Section=2;
next;
}
if ($line eq $EndReverse) {
$Section=0;
last;
}
next if (! $Section);
if ($Section == 2) { # Reverse
if ($line =~ m#^(.*)#o) {
$target = $1;
next;
}
next if (! $target);
if ($line=~
m#^.*\s*\((\d+)\s*reference.*\).*$#oi){
$TargetCounter{"$target $1"} += $2;
$SourceCounter{"$1 $target"} += $2;
$refscounter += $2;
$StatusFlag{"$target $1"} = $3;
}
}
if ($Section == 1) { # Forward
if ($line =~
m#^.*#o) {
$source = $1;
next;
}
next if (! $source);
if ($line=~
m#^(\S*)\s+\((\d+)\s*reference.*\).*$#oi){
$TargetCounter{"$1 $source"} += $2;
$SourceCounter{"$source $1"} += $2;
$refscounter += $2;
$StatusFlag{"$1 $source"} = $3;
}
}
}
close(INCLUSION);
}
}
sub Initialize {
&ReadCommandLine('include:output:name:logtype:exfrom:exto:minrefs:stripref:striptarget');
# Old file to read (optional)
$IncludeFile = $opt{'include'} if ($opt{'include'});
# Output file (optional - goes to STDOUT if not specified)
$OutputFile = $opt{'output'} if ($opt{'output'});
# Type of log (legal: combined, referer)
$LogType = $opt{'logtype'} if ($opt{'logtype'});
# Exclusion patterns
$EXCLUDEREFSFROM= $opt{'exfrom'} if ($opt{'exfrom'});
$EXCLUDEREFSTO= $opt{'exto'} if ($opt{'exto'});
# '?' parameter stripping options
$STRIPTARGETPARMS = $opt{'striptarget'} if (defined($opt{'striptarget'}));
$STRIPREFPARMS = $opt{'strip'} if (defined($opt{'stripref'}));
# Name of server
$HTTPDSERVER=$opt{'name'} if ($opt{'name'});
# Minimum number of references for a reference to be included in the list
$MinRefs = $opt{'minrefs'} if ($opt{'minrefs'});
# Markers for the various log sections
$StartForward="";
$EndForward ="";
$StartReverse="";
$EndReverse ="";
$refscounter=0;
}
sub ReadCommandLine {
# parse list has the form 'a:b:c'
# flags with parse list entries must take values
local($parselist)=$_[0];
local(@CommandLine)=@ARGV;
local(@ParseList,%ParseRules,@GenericList);
(@ParseList)=split(/:/,$parselist);
foreach $item (@ParseList) {
$ParseRules{$item}=1;
}
while ($parm=shift(@CommandLine)) {
if ($parm =~ m#^\-([a-zA-Z]+)$#o) {
$parm=$1;
$opt{$parm}=1;
if ($ParseRules{$parm}) {
$value=shift(@CommandLine);
if ($value eq "") {
die ("Invalid comand line switch usage, '-$parm' requires value\n");
}
$opt{$parm}=$value;
}
next;
}
push(@GenericList,$parm);
}
@ARGV=@GenericList;
}
sub ReadLogFiles {
local (@LogFilesList)=@_;
local ($Domain,$rfc931,$authuser,$TimeDate,$Request,
$Status,$Bytes,$referrer,$Agent,$Method,$target,
$Protocal,$LogFile,$OpeFile,$line,$keyform,$key,
$value);
foreach $LogFile (@LogFilesList) {
$OpeFile=$LogFile;
if ($LogFile =~ m#\.gz$#o) {
$OpeFile="$ungzip $LogFile |";
} elsif ($LogFile =~ m#\.bz2#) {
$OpeFile = "$unbzip2 $LogFile |";
}
if (! open(REFSLOG,$OpeFile) ) {
warn "Can't open $LogFile. Skipped.\n $!";
next;
}
binmode REFSLOG;
while($line=) {
$refscounter++;
chop $line;
if ($LogType eq 'combined') {
($Domain,$rfc931,$authuser,$TimeDate,$Request,$Status,$Bytes,$referrer,$Agent) = $line =~
/^(\S+) (\S+) (\S+) \[([^\]\[]+)\] \"([^"]*)\" (\S+) (\S+) \"?([^"]*)\"? \"([^"]*)\"/o;
($Method,$target,$Protocal)=split(/\s/,$Request,3);
} elsif ($LogType eq 'multihost-combined') {
($webhost, $Domain,$rfc931,$authuser,$TimeDate,$Request,$Status,$Bytes,$referrer,$Agent) = $line =~
/^(\S+) (\S+) (\S+) (\S+) \[([^\]\[]+)\] \"([^"]*)\" (\S+) (\S+) \"?([^"]*)\"? \"([^"]*)\"/o;
($Method,$target,$Protocal)=split(/\s/,$Request,3);
next if ($Only_Specified_Host and $webhost !~ m/$Only_Specified_Host/o);
} elsif ($LogType eq 'referer') {
($referrer,$target)=split(/ -> /,$line,2);
}
$referrer =~ s/^\-$//o;
$target =~ s/^\-$//o;
if (! ($referrer && $target) ) {
next if (! $roxenhack);
($Domain,$referrer,$authuser,$TimeDate,$Request,$Status,$Bytes)
= $line =~
/^(\S+) \"?([^"]*)\"? (\S+) \[([^\]\[]+)\] \"([^"]*)\" (\S+) (\S+)/o;
($Method,$target,$Protocal)=split(/\s/,$Request,3);
$referrer =~ s/^\-$//o;
$target =~ s/^\-$//o;
next if (! ($referrer && $target));
}
$target=~ s/\%7[eE]/~/o; # Caniconalize %7E and %7e as ~
$target=~ s#//#/#go; # Remove any extra slashes
$target=~ s#^ *$#/#o; # fix root ref if needed
$target=~ s/#.+$//o; # combine #anchor refs with root doc
$target=~ s/\?.*$//o if $STRIPTARGETPARMS; # strip '?' parameters
$target = "$Assumed_Protocal://$webhost$target" if $Include_Webhost_In_Target;
if ($BLANKTARGET) {
$target =~ s/$BLANKTARGET/$BLANKTARGETREPLACE/o;
}
next if ($EXCLUDEREFSTO && ($target=~m#$EXCLUDEREFSTO#io));
$referrer=~ s/\%7[eE]/~/o; # Caniconalize %7E and %7e as ~
$referrer=~ s#^(http://[^/]+):80/#$1/#o; # remove unneeded :80 port specification
$referrer=~ s/#.+$//o; # combine #anchor refs with root doc
$referrer=~ s/\?.*$//o if $STRIPREFPARMS; # strip '?' parameters
if ($BLANKREFER) {
$referrer =~ s/$BLANKREFER/$BLANKREFERREPLACE/o;
}
next if ($EXCLUDEREFSFROM && ($referrer=~m#$EXCLUDEREFSFROM#oi));
$keyform=$target.' '.$referrer;
$keyform=~ s#<#\<\;#og; # prevent accidents with '<'
$keyform=~ s#>#\>\;#og; # prevent accidents with '>'
$keyform=~ s#\\&\;#og; # prevent accidents with '&'
$keyform=~ s#"#\"\;#og; # prevent accidents with '"'
$TargetCounter{$keyform}++;
$StatusFlag{$keyform}=$Status;
$keyform= $referrer.' '.$target;
$keyform=~ s#<#\<\;#og; # prevent accidents with '<'
$keyform=~ s#>#\>\;#og; # prevent accidents with '>'
$keyform=~ s#\\&\;#og; # prevent accidents with '&'
$keyform=~ s#"#\"\;#og; # prevent accidents with '"'
$SourceCounter{$keyform}++;
}
close(REFSLOG);
}
# Remove false hits caused by people using incorrect URLs
# that are redirected by the server
while(($key,$value) = each %TargetCounter) {
($target,$referrer)=split(/ /,$key,2);
if ($TargetCounter{"$target/ $referrer"}) {
$refscounter-=$TargetCounter{$key};
$TargetCounter{$key}=-1;
$SourceCounter{"$referrer $target"}=-1;
}
}
}