#!/usr/bin/perl # History # # 1.2 19970910 added forward link reporting and broken link highlighting # and chunked up code into subroutines # # 1.1.3 added optional reading of compressed log files # # 1.1.2 added blanking filter for random URL things # # 1.1.1 added optional stripping of '?' parms from target # and referring URLS # # 1.1 added command line parsing, support for multiple log formats # and inclusion of previous reports and multiple log files. Made hotlinking # mandatory, and made the exclusion patterns case-insensitive # # 1.0.1 corrected a problem with double counting hits from # people who use directoryname instead of directoryname/ to access # default html files (caused by 1.4 counting both the # original hit and the redirect), combined #anchor hits with # their base documents and removed redundant ':80' port specifications # from reported URLS. Thanks goes to richie@ljouwert.et.tudelft.nl # (http://morra.et.tudelft.nl/~richie/) for these suggested improvements. # # Also, the report is now sorted by number of hits as well as by # target page and the ability to exclude particular pages from # the report has been added, both as refering pages and as target pages. $version="1.2.1"; # For decompressing logs, if necessary $ungzip = "/bin/gzip -cd "; $unbzip2 = "/usr/bin/bzip2 -cd "; # Exclusion patterns (uncomment to use) $EXCLUDEREFSTO='\.gif|\.jpg'; #$EXCLUDEREFSFROM='^http:\/\/www.devilbunnies.org|\-$'; # URL blanking filters for targets $BLANKTARGET ='/chat.cgi/.*$'; $BLANKTARGETREPLACE = '/chat.cgi'; # $BLANKTARGET ='(\/\d+\-\d+\-menu.asis|\/\d+\-\d+.asis|\?newsgroup.*)$'; # $BLANKTARGETREPLACE = '(multiple)'; # URL blanking filters for referrer $BLANKREFER ='/chat.cgi/.*$'; $BLANKREFERREPLACE = '/chat.cgi'; # $BLANKREFER ='(\/\d+\-\d+\-menu.asis|\/\d+\-\d+.asis|\?newsgroup.*)$'; # $BLANKREFERREPLACE = '(multiple)'; # Strip '?' variable parameters from target URLs $STRIPTARGETPARMS=1; # Strip '?' variable parameters from referring URLs $STRIPREFPARMS=0; # default path to the referer_log $RefererLog="/www/common/logs/nihongo/access_log"; # Name of server $HTTPDSERVER='www.nihongo.org'; # Minimum number of references for a ref to be included in the list $MinRefs=2; # Log type (combined, multihost-combined, referer) $LogType="combined"; # Roxen hack - for 400 series status codes, get the referrer from the # RFC931 field because the Roxen server does stupid stuff $roxenhack=0; # Print the Broken Links Report $PrintBrokenLinks=1; # Print the Reverse Link Report $PrintReverseLinks=1; # Print the Forward Link Report $PrintForwardLinks=1; # Restrict a multihost-combined analysis to webhosts matching the following regex $Only_Specified_Host = '^(www\.)?nihongo\.org'; # Include or don't include webhost in local page target URL $Include_Webhost_In_Target = 0; # Protocal to assume for local page target URLs if webhost is included in the local target URL $Assumed_Protocal = 'http'; &Initialize; &IncludeOldFiles($IncludeFile) if ($IncludeFile); @ARGV = ("$RefererLog") if ($#ARGV == -1); &ReadLogFiles(@ARGV); if ($OutputFile) { open (OUTPUTFILE,">$OutputFile") || die ("Could not open $OutputFile for writing.\n$!"); binmode OUTPUTFILE; select(OUTPUTFILE); } print <<"EOF"; Referring URL Statistics for ${HTTPDSERVER}

Referring URL Statistics for ${HTTPDSERVER}

$refscounter references examined this run.
Last updated: EOF print scalar(localtime); print <<"EOF";

This is a report of what URL browsers reported as the referring URL that directed them to a particular web page here. Inaccuracies are due mainly to some browsers reporting wrong information under some conditions.

EOF if ($MinRefs > 1) { print "Only referring URLs with at least $MinRefs reports were included ", "in this list.\n

\n"; } print "

\n\n\n\n\n
\n\n
\n
\n"; &DoBrokenLinks if ($PrintBrokenLinks); &DoForwardLinks if ($PrintForwardLinks); &DoReverseLinks if ($PrintReverseLinks); print <<"EOF";
Statistics generated by RefStats $version / snowhare\@nihongo.org
EOF select(STDOUT); # All Done. Everything after this is subroutines. # Print all the broken links sub DoBrokenLinks { print <<"EOF";

Broken Links:

EOF $LastWebPage=''; foreach $key (sort bytargetthenhits keys(%TargetCounter)) { next if ($StatusFlag{$key} != 404); ($target,$referrer)=split(/ /,$key,2); if ("$target" ne "$LastWebPage") { print "
$target
\n"; } print "
$referrer", " ($TargetCounter{$key} reference"; if ($TargetCounter{$key} > 1) { print "s"; } if ($StatusFlag{$key} == 404) { print ', Broken Link'; } print ")
\n"; $LastWebPage=$target; } print <<"EOF";
Return To Top

EOF } # Print all the reverse links sub DoReverseLinks { local ($Status,$LastWebPage,$key,$target,$referrer); print <<"EOF";

Reverse Links:

$StartReverse EOF $LastWebPage=''; foreach $key (sort bytargetthenhits keys(%TargetCounter)) { ($target,$referrer)=split(/ /,$key,2); next if ($TargetCounter{$key} < $MinRefs); if ("$target" ne "$LastWebPage") { print "
$target
\n"; } print "
$referrer", " ($TargetCounter{$key} reference"; if ($TargetCounter{$key} > 1) { print "s"; } $Status=$StatusFlag{$key}; if ($Status == 404) { print ', Broken Link'; } print ")
\n"; $LastWebPage=$target; } print <<"EOF"; $EndReverse
Return To Top

EOF } # Print all the forward links sub DoForwardLinks { local ($Status,$LastWebPage,$key,$referrer,$target); print <<"EOF";

Forward Links

$StartForward EOF $LastWebPage=''; foreach $key (sort bysourcethenhits keys(%SourceCounter)) { ($referrer,$target)=split(/ /,$key,2); next if ($SourceCounter{$key} < $MinRefs); if ("$referrer" ne "$LastWebPage") { print "
$referrer
\n"; } print "
$target ($SourceCounter{$key} reference"; if ($SourceCounter{$key} > 1) { print "s"; } $Status=$StatusFlag{"$target $referrer"}; if ($Status == 404) { print ', Broken Link'; } print ")
\n"; $LastWebPage=$referrer; } print <<"EOF"; $EndForward
Return To Top

EOF } # for sorting the reverse report sub bytargetthenhits { ($targeta)=($a=~m#^(.+)\s#o); ($targetb)=($b=~m#^(.+)\s#o); $inequality=($targeta cmp $targetb); if ($inequality) { $inequality; } else { $TargetCounter{$b}<=>$TargetCounter{$a}; } } # for sorting the forward report sub bysourcethenhits { ($sourcea)=($a=~m#^(.+)\s#o); ($sourceb)=($b=~m#^(.+)\s#o); $inequality=($sourcea cmp $sourceb); if ($inequality) { $inequality; } else { $SourceCounter{$b}<=>$SourceCounter{$a}; } } # Merge old reports with this one. Is order of operation dependant # - you can end up with bogus 'Status' codes if you merge new # to old instead of old to new. sub IncludeOldFiles { local (@IncludeFilesList)=@_; local ($target,$source,$line,$Section,$GotData,$IncludeFile); foreach $IncludeFile (@IncludeFilesList) { if (! open (INCLUSION,$IncludeFile)) { warn "Could not open $IncludeFile for inclusion.\n$!"; return; } binmode INCLUSION; $Section=0; while ($line = ) { chop $line; # The funky if statements are so you # can include or omit everything except the # broken links report and still get your data # The bailout is so the order of the reports # can be changed without breaking the code if ($line eq $StartForward) { $Section=1; next; } if ($line eq $EndForward) { $Section=0; last; } if ($line eq $StartReverse) { $Section=2; next; } if ($line eq $EndReverse) { $Section=0; last; } next if (! $Section); if ($Section == 2) { # Reverse if ($line =~ m#^
(.*)
#o) { $target = $1; next; } next if (! $target); if ($line=~ m#^
.*\s*\((\d+)\s*reference.*\).*
$#oi){ $TargetCounter{"$target $1"} += $2; $SourceCounter{"$1 $target"} += $2; $refscounter += $2; $StatusFlag{"$target $1"} = $3; } } if ($Section == 1) { # Forward if ($line =~ m#^
.*
#o) { $source = $1; next; } next if (! $source); if ($line=~ m#^
(\S*)\s+\((\d+)\s*reference.*\).*
$#oi){ $TargetCounter{"$1 $source"} += $2; $SourceCounter{"$source $1"} += $2; $refscounter += $2; $StatusFlag{"$1 $source"} = $3; } } } close(INCLUSION); } } sub Initialize { &ReadCommandLine('include:output:name:logtype:exfrom:exto:minrefs:stripref:striptarget'); # Old file to read (optional) $IncludeFile = $opt{'include'} if ($opt{'include'}); # Output file (optional - goes to STDOUT if not specified) $OutputFile = $opt{'output'} if ($opt{'output'}); # Type of log (legal: combined, referer) $LogType = $opt{'logtype'} if ($opt{'logtype'}); # Exclusion patterns $EXCLUDEREFSFROM= $opt{'exfrom'} if ($opt{'exfrom'}); $EXCLUDEREFSTO= $opt{'exto'} if ($opt{'exto'}); # '?' parameter stripping options $STRIPTARGETPARMS = $opt{'striptarget'} if (defined($opt{'striptarget'})); $STRIPREFPARMS = $opt{'strip'} if (defined($opt{'stripref'})); # Name of server $HTTPDSERVER=$opt{'name'} if ($opt{'name'}); # Minimum number of references for a reference to be included in the list $MinRefs = $opt{'minrefs'} if ($opt{'minrefs'}); # Markers for the various log sections $StartForward=""; $EndForward =""; $StartReverse=""; $EndReverse =""; $refscounter=0; } sub ReadCommandLine { # parse list has the form 'a:b:c' # flags with parse list entries must take values local($parselist)=$_[0]; local(@CommandLine)=@ARGV; local(@ParseList,%ParseRules,@GenericList); (@ParseList)=split(/:/,$parselist); foreach $item (@ParseList) { $ParseRules{$item}=1; } while ($parm=shift(@CommandLine)) { if ($parm =~ m#^\-([a-zA-Z]+)$#o) { $parm=$1; $opt{$parm}=1; if ($ParseRules{$parm}) { $value=shift(@CommandLine); if ($value eq "") { die ("Invalid comand line switch usage, '-$parm' requires value\n"); } $opt{$parm}=$value; } next; } push(@GenericList,$parm); } @ARGV=@GenericList; } sub ReadLogFiles { local (@LogFilesList)=@_; local ($Domain,$rfc931,$authuser,$TimeDate,$Request, $Status,$Bytes,$referrer,$Agent,$Method,$target, $Protocal,$LogFile,$OpeFile,$line,$keyform,$key, $value); foreach $LogFile (@LogFilesList) { $OpeFile=$LogFile; if ($LogFile =~ m#\.gz$#o) { $OpeFile="$ungzip $LogFile |"; } elsif ($LogFile =~ m#\.bz2#) { $OpeFile = "$unbzip2 $LogFile |"; } if (! open(REFSLOG,$OpeFile) ) { warn "Can't open $LogFile. Skipped.\n $!"; next; } binmode REFSLOG; while($line=) { $refscounter++; chop $line; if ($LogType eq 'combined') { ($Domain,$rfc931,$authuser,$TimeDate,$Request,$Status,$Bytes,$referrer,$Agent) = $line =~ /^(\S+) (\S+) (\S+) \[([^\]\[]+)\] \"([^"]*)\" (\S+) (\S+) \"?([^"]*)\"? \"([^"]*)\"/o; ($Method,$target,$Protocal)=split(/\s/,$Request,3); } elsif ($LogType eq 'multihost-combined') { ($webhost, $Domain,$rfc931,$authuser,$TimeDate,$Request,$Status,$Bytes,$referrer,$Agent) = $line =~ /^(\S+) (\S+) (\S+) (\S+) \[([^\]\[]+)\] \"([^"]*)\" (\S+) (\S+) \"?([^"]*)\"? \"([^"]*)\"/o; ($Method,$target,$Protocal)=split(/\s/,$Request,3); next if ($Only_Specified_Host and $webhost !~ m/$Only_Specified_Host/o); } elsif ($LogType eq 'referer') { ($referrer,$target)=split(/ -> /,$line,2); } $referrer =~ s/^\-$//o; $target =~ s/^\-$//o; if (! ($referrer && $target) ) { next if (! $roxenhack); ($Domain,$referrer,$authuser,$TimeDate,$Request,$Status,$Bytes) = $line =~ /^(\S+) \"?([^"]*)\"? (\S+) \[([^\]\[]+)\] \"([^"]*)\" (\S+) (\S+)/o; ($Method,$target,$Protocal)=split(/\s/,$Request,3); $referrer =~ s/^\-$//o; $target =~ s/^\-$//o; next if (! ($referrer && $target)); } $target=~ s/\%7[eE]/~/o; # Caniconalize %7E and %7e as ~ $target=~ s#//#/#go; # Remove any extra slashes $target=~ s#^ *$#/#o; # fix root ref if needed $target=~ s/#.+$//o; # combine #anchor refs with root doc $target=~ s/\?.*$//o if $STRIPTARGETPARMS; # strip '?' parameters $target = "$Assumed_Protocal://$webhost$target" if $Include_Webhost_In_Target; if ($BLANKTARGET) { $target =~ s/$BLANKTARGET/$BLANKTARGETREPLACE/o; } next if ($EXCLUDEREFSTO && ($target=~m#$EXCLUDEREFSTO#io)); $referrer=~ s/\%7[eE]/~/o; # Caniconalize %7E and %7e as ~ $referrer=~ s#^(http://[^/]+):80/#$1/#o; # remove unneeded :80 port specification $referrer=~ s/#.+$//o; # combine #anchor refs with root doc $referrer=~ s/\?.*$//o if $STRIPREFPARMS; # strip '?' parameters if ($BLANKREFER) { $referrer =~ s/$BLANKREFER/$BLANKREFERREPLACE/o; } next if ($EXCLUDEREFSFROM && ($referrer=~m#$EXCLUDEREFSFROM#oi)); $keyform=$target.' '.$referrer; $keyform=~ s#<#\<\;#og; # prevent accidents with '<' $keyform=~ s#>#\>\;#og; # prevent accidents with '>' $keyform=~ s#\&#\&\;#og; # prevent accidents with '&' $keyform=~ s#"#\"\;#og; # prevent accidents with '"' $TargetCounter{$keyform}++; $StatusFlag{$keyform}=$Status; $keyform= $referrer.' '.$target; $keyform=~ s#<#\<\;#og; # prevent accidents with '<' $keyform=~ s#>#\>\;#og; # prevent accidents with '>' $keyform=~ s#\&#\&\;#og; # prevent accidents with '&' $keyform=~ s#"#\"\;#og; # prevent accidents with '"' $SourceCounter{$keyform}++; } close(REFSLOG); } # Remove false hits caused by people using incorrect URLs # that are redirected by the server while(($key,$value) = each %TargetCounter) { ($target,$referrer)=split(/ /,$key,2); if ($TargetCounter{"$target/ $referrer"}) { $refscounter-=$TargetCounter{$key}; $TargetCounter{$key}=-1; $SourceCounter{"$referrer $target"}=-1; } } }