#!/usr/bin/perl -w package LogAnalysis::FavIconTracker; ############################################################################ # Version 1.0 of the FavIconTracker log analysis program. # Copyright 1999 Benjamin Franz # # You may freely copy and distribute this software so long # as all documentation and this notice is included. # # If you wish to modify and distribute this software, feel free. # You MAY NOT charge for distributing it. If you modify it, # please make sure you document it (and if it is a good hack # let me know - I might include it in a future release). # # The most current release can always be found at # # # THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS # OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A # PARTICULAR PURPOSE. # # Use of this software in any way or in any form, source or binary, # is not allowed in any country which prohibits disclaimers of any # implied warranties of merchantability or fitness for a particular # purpose or any disclaimers of a similar nature. # # IN NO EVENT SHALL I BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, # SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE # USE OF THIS SOFTWARE AND ITS DOCUMENTATION (INCLUDING, BUT NOT # LIMITED TO, LOST PROFITS) EVEN IF I HAVE BEEN ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE ############################################################################# =head1 NAME LogAnalysis::FavIconTracker - A package for generating a report of 'favicon.ico' bookmarks from a web log. =head1 SYNOPSIS =head1 DESCRIPTION Microsoft has added a feature to their Internet Explorer 5 web browser that allows web sites to place icons for display with bookmarks. As a side-effect of this, hits are generated in the web server logs that can be analyzed to determine that a bookmark has been set and for what section of the site. This script performs a fast analysis (in testing it was able to process a 55 megabyte access_log in 23 seconds) of those bookmarks and generates a report of what is being bookmarked, and how often and what is the most popularly bookmarked areas of a web site. The defaults are set in the 'new' method and most options can be overridden from the command line. It has only been tested under Perl 5.005, but should work fine under any version of Perl 5. =head1 CHANGES 1.0.2 13 Jul 1999 - Fixed missing 'Jul' for report generation. 1.0.1 05 May 1999 - Fixed postfix detection for compressed files. =head2 Initialization =cut use strict; use Carp; use Getopt::Long; my $VERSION="FavIconTracker 1.0.2"; my $analyzer = LogAnalysis::FavIconTracker->new; $analyzer->read_command_line; $analyzer->analyze; ############################################################################# =over 4 =item C Initializes the LogAnalysis::FavIconTracker object for use and sets the default values for various options. =back =cut sub new { my ($class) = shift; my $self = bless {},$class; $self->declare('-compression_programs','-system_name','-log_type','-log_format', '-log_files','-default_pages','-body_tag','-head_tags', '-output_file','-include_old','-results','-log_field_to_name', '-name_to_log_field','-start_date','-end_date','-output', '-log_regex','-run_time_start', ); # You will probably want to change the following defaults # to match your own system configuration $self->set({ -compression_programs => { 'gz' => '/usr/bin/gzip -cd', # Program to decompress files ending with .gz 'Z' => '/bin/zcat', # Program to decompress files ending with .Z 'bz2' => '/usr/bin/bzip2 -cd', # Program to decompress files ending with .bz2 }, -system_name => 'www.nihongo.org', # Name of your system -log_type => 'common', # common = WWW common log format # combined = NCSA combined log format -log_format => '', # If you are using the Apache Custom Log Module, # you can put your LogFormat here. I suggest # stripping any conditional markup, ie: # %400,500{referer}i should be %{referer}i -log_files => ['/www/common/logs/nihongo/access_log'], # Default location(s) of the log file # *YOU WILL HAVE TO CHANGE THIS TO MATCH YOUR SYSTEM* -default_pages => { 'index.html' => '', , # Used to merge "/" references with "index.html" 'index.htm' => '', , # Used to merge "/" references with "index.htm" }, -body_tag => '', -head_tags => qq ( ), -output_file => '/www/servers/nihongo.org/snowhare/utilities/favicontracker/example/index.html', # File to output report to -include_old => ['/www/servers/nihongo.org/snowhare/utilities/favicontracker/example/index.html'], # Old report to have data included in a new report }); if ($#_ > -1) { $self->set(@_); } $self; } ############################################################################# =over 4 =item C Reads the command line for program options. Allowed options are 'include', 'ouput', 'body_tag', 'head', 'system_name', 'log_type' and 'log_format'. All parameters left on the command line after parsing the parms named here are treated as the names of log files to be analyzed. --include=filename This option allows the inclusion of the data from one or more previously generated reports into a new report. You can include more than one old report by using multipe --include==filename command line entries. This option overrides the '-include_old' array in the default settings. --output=filename This allows the override of the output file for the report. It overrides the '-output' setting in the defaults. It can only occur ONCE. --body_tag='' This overrides the default 'BODY' tag for the document as set by the '-body_tag' default. --head='text' Overrides the '-head_tags' default, allowing you to insert text into the HEAD of the document (like META's or LINKs). --system_name='text' Overrides the '-system_name' default, setting the text to be placed at the top of the report and for the page TITLE. --log_type='common' Overrides the '-log_type' system default. Only legal values are 'common' or 'combined'. --log_format Overrides the '-log_format' system default with an Apache derived custom log format, with some additions. This also overrides the '-log_type' preemptively. Use only if you are a wizard at log formats and Perl. =back =cut sub read_command_line { my ($self) = shift; my $opt_control = {}; Getopt::Long::Configure('permute'); my $result = GetOptions($opt_control, "include=s@", "output=s", "body_tag=s", "head=s", "system_name=s", "log_type", "log_format", ); # Overridden Log files my @log_files = @ARGV; if ($#log_files > -1) { $self->set({ -log_files => [@log_files] }); } # Overridden included old reports if (defined $opt_control->{'include'}) { $self->set({ -include_old => $opt_control->{'include'} }); } # Overridden system name for report if (defined $opt_control->{'system_name'}) { $self->set({ -system_name => $opt_control->{'system_name'} }); } # Overridden BODY tag if (defined $opt_control->{'body_tag'}) { $self->set({ -body_tag => $opt_control->{'body_tag'} }); } # Overridden output file if (defined $opt_control->{'output'}) { $self->set({ -output_file => $opt_control->{'output'} }); } # Overridden text for insertion into HEAD of document if (defined $opt_control->{'head'}) { $self->set({ -head_tags => $opt_control->{'head_tag'} }); } # Overridden log type ('combined' or 'common') if (defined $opt_control->{'log_type'}) { $self->set({ -log_type => $opt_control->{'log_type'} }); } # Overridden log format (set the 'parse_log_format' method for details) if (defined $opt_control->{'log_format'}) { $self->set({ -log_format => $opt_control->{'log_format'} }); } } ############################################################################# =over 4 =item C Read old reports, new logs and analyze the resulting data. If an '-output_file' has been set, saves the report to the named file. The HTML report is also returned as the return value of the method. =back =cut sub analyze { my ($self) = shift; if ($#_ > -1) { $self->set(@_); } $self->set({ -results => { -urls => {}, }, -start_date => '99999999999999', -end_date => '00000000000000', -output => '', -log_regex => '', -run_time_start => time, }); $self->read_old_reports; $self->read_new_logs; $self->make_report; my ($output) = $self->get('-output'); $output; } ############################################################################# =over 4 =item C Reads a web server log and extracts all hits on 'favicon.ico' files to determine bookmarks being set. Data is stored in the '-results' object parameter as an anonymous hash. $results->{-urls}->{$url}->{-last_date} = most recent bookmark date $results->{-urls}->{$url}->{-count} = number of bookmarks; The list of log files to be read is taken from the '-log_files' object parameter. Compressed log files can be read. =back =cut sub read_new_logs { my ($self) = shift; $self->parse_log_format; my ($results,$new_log_files,$compression_programs, $log_field_to_name,$name_to_log_field, $start_date,$end_date,$log_regex,$log_format, $default_page_names) = $self->get('-results','-log_files','-compression_programs', '-log_field_to_name','-name_to_log_field', '-start_date','-end_date','-log_regex','-log_format', '-default_pages', ); my ($timedate_field) = $name_to_log_field->{'timedate'}; my ($url_field) = $name_to_log_field->{'url'}; my ($status_field) = $name_to_log_field->{'status'}; if (not (defined ($timedate_field) and defined ($url_field))) { die ("Specified log format of '$log_format' can not be parsed for dates and URLs\n"); } my $default_page_match = ''; foreach my $match (keys %{$default_page_names}) { $default_page_match .= '|' . quotemeta($match); } $default_page_match =~ s/^\|+//o; if ($default_page_match ne '') { $default_page_match = "($default_page_match)\$"; } my $page_match_pattern = qr /$default_page_match/; my %month_name_to_number = ( Jan => '01', Feb => '02', Mar => '03', Apr => '04', May => '05', Jun => '06', Jul => '07', Aug => '08', Sep => '09', Oct => '10', Nov => '11', Dec => '12', ); foreach my $log_file (@$new_log_files) { # Handle compressed logs my ($post_fix) = $log_file =~ m/\.([^.]+)$/o; if (defined ($post_fix) and ($post_fix ne '') and defined ($compression_programs->{$post_fix})) { $log_file = "$compression_programs->{$post_fix} $log_file |"; } if (not open (LOGFILE,$log_file)) { warn ("Unable to open log file '$log_file' ($!), skipped file.\n"); next; } my (@elements) = (); my ($timedate,$status,$url,$day,$month,$month_name,$year,$hour,$minute,$second,$date); my ($home_hits) = 0; my ($non_home_hits) = 0; while () { next if (not m/favicon\.ico /o); # Speed hack. Boosts performance about 4x. chop; next if (not (@elements = m/$log_regex/)); $url = $elements[$url_field]; $status = $elements[$status_field]; next if ($url !~ m#/favicon\.ico$#o); # We are only looking for MSIE 5 bookmarks if ($url =~ m#^/favicon.ico$#o) { $home_hits++; } else { $non_home_hits++; if ($status >= 400) { $home_hits--; # MS is going to try in the home dir since it didn't get an icon here. } } $url =~ s#/favicon\.ico$##o; # Strip the '/favicon.ico' from the end of the URL $url = '/' if ($url eq ''); # Insert '/' if we lost it. $url =~ s#%7[eE]#~#go; # Canonicalize %7[Ee] as '~' $url =~ s#\?.*$##o; # Strip parms $url =~ s/#.*$//o; # Strip anchors $url =~ s/$page_match_pattern/$default_page_names->{$1}/e if ($default_page_match ne ''); # Merge default pages $timedate = $elements[$timedate_field]; ($day,$month_name,$year,$hour,$minute,$second) = $timedate =~ m#^\[(\d\d)/(\w\w\w)/(\d\d\d\d):(\d\d):(\d\d):(\d\d)#o; $date = "$year$month_name_to_number{$month_name}$day$hour$minute$second"; $start_date = $date if ($date lt $start_date); $end_date = $date if ($date gt $end_date); if (not (defined ($results->{-urls}->{$url}->{-last_date})) or ($date gt $results->{-urls}->{$url}->{-last_date})) { $results->{-urls}->{$url}->{-last_date} = $date; } $results->{-urls}->{$url}->{-count}++; } if ($home_hits < 1) { $home_hits = 0; } if ($home_hits) { $results->{-urls}->{'/'}->{-count} = $home_hits; } elsif (exists ($results->{-urls}->{'/'})) { delete $results->{-urls}->{'/'}; } close (LOGFILE); } $self->set({ -results => $results, -end_date => $end_date, -start_date => $start_date, }); } ############################################################################# =over 4 =item C Reads report files generated by FavIconTracker and adds any bookmark date from the report(s) into the data set (stored in '-results'). The list of old reports to be read is taken from '-include_old'. The reader can handle compressed files. =back =cut sub read_old_reports { my ($self) = shift; my ($results,$include_reports,$compression_programs, $start_date,$end_date) = $self->get('-results','-include_old','-compression_programs', '-start_date','-end_date'); my %month_name_to_number = ( Jan => '01', Feb => '02', Mar => '03', Apr => '04', May => '05', Jun => '06', Jul => '07', Aug => '08', Sep => '09', Oct => '10', Nov => '11', Dec => '12', ); foreach my $file (@$include_reports) { # Handle compressed files my ($post_fix) = $file =~ m/\.([^.]+)$/o; if (defined ($post_fix) and ($post_fix ne '') and defined ($compression_programs->{$post_fix})) { $file = "$compression_programs->{$post_fix} $file |"; } if (not open (FILE,$file)) { warn ("Unable to open report file '$file' ($!), skipped file.\n"); next; } my $section = ''; while () { next if (not m//o); $section = $1; last; } next if ($section eq ''); my ($date,$hour,$minute,$second,$day,$month_name,$year,$count,$url); while () { chop; last if (m#^$#o); ($hour,$minute,$second,$day,$month_name,$year,$count,$url) = m/^(\d\d):(\d\d):(\d\d)\s+(\d+)\s+(\w\w\w)\s+(\d\d\d\d)\s+(\d+)\s+(\S+)$/o; next if (not defined $url); $hour = "0$hour" if (length($hour) < 2); $minute = "0$minute" if (length($minute) < 2); $second = "0$hour" if (length($second) < 2); $day = "0$day" if (length($day) < 2); $date = "$year$month_name_to_number{$month_name}$day$hour$minute$second"; $start_date = $date if ($date lt $start_date); $end_date = $date if ($date gt $end_date); if ( not (defined ($results->{-urls}->{$url}->{-last_date})) or ($date gt $results->{-urls}->{$url}->{-last_date})) { $results->{-urls}->{$url}->{-last_date} = $date; } $count =~ s/,//og; $results->{-urls}->{$url}->{-count} += $count; } close (LOGFILE); } $self->set({ -results => $results, -end_date => $end_date, -start_date => $start_date, }); } ############################################################################# =over 4 =item C Generates a report from the data acquired in the read_new_logis and/or read_old_reports methods. If an '-ouput_file' has been set, outputs the report to that file. Also returns the HTML output as its return value. =back =cut sub make_report { my ($self) = shift; my ($output_file,$body_tag,$head_tags, $system_name,$results,$start_date, $end_date,$run_time_start) = $self->get('-output_file','-body_tag','-head_tags','-system_name', '-results','-start_date','-end_date','-run_time_start'); my ($output_list) = {}; my @month_names = ('','Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'); my (@url_list) = sort {lc ($a) cmp lc ($b)} keys %{$results->{-urls}}; my (%output_list,$url,$esc_url,$bookmarked_entries,$bookmarked_last_date,$year,$month,$day,$hour,$min,$sec,$last_date); foreach $url (@url_list) { $esc_url = $url; $esc_url =~ s/\&/\&/o; $esc_url =~ s/\/\>/o; $esc_url =~ s/\"/\"/o; $bookmarked_entries = $results->{-urls}->{$url}->{-count}; 1 while ($bookmarked_entries =~ s/(.*\d)(\d\d\d)/$1,$2/o); # inserts commas into number $bookmarked_last_date = $results->{-urls}->{$url}->{-last_date}; ($year,$month,$day,$hour,$min,$sec) = $bookmarked_last_date =~ m/^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})$/o; $last_date = "$hour:$min:$sec $day $month_names[$month] $year"; $output_list{$url} = sprintf("%s %10s %s\n",$last_date,$bookmarked_entries,$esc_url); } $system_name =~ s/\&/\&/o; $system_name =~ s/\/\>/o; $system_name =~ s/\"/\"/o; # Figure the run time for the script my $run_time = time - $run_time_start; my $minutes = int ($run_time / 60); my $seconds = int ($run_time % 60); if ($seconds != 1) { $run_time = "$seconds seconds"; } else { $run_time = "$seconds second"; } if ($minutes > 1) { if ($seconds > 0) { $run_time = "$minutes minutes $run_time"; } else { $run_time = "$minutes minutes"; } } elsif ($minutes == 1) { if ($seconds > 0) { $run_time = "$minutes minute $run_time"; } else { $run_time = "$minutes minute"; } } # Figure today's date my $today = scalar localtime time; my $output =<<"EOF"; $system_name $head_tags $body_tag

$system_name

Index

Generated On: $today
Run Time: $run_time

EOF ########## # By URL # ########## $output .=<<"EOF";

Bookmark Entries Sorted By Page

Last On                  Number  Page
EOF
    foreach my $item (@url_list) {
        $output .= $output_list{$item};
    }
    $output .= "
\n"; ##################### # By Bookmark Count # ##################### $output .=<<"EOF";

Bookmark Entries Sorted By Number

Last On                  Number  Page
EOF

    foreach my $item (sort { $results->{-urls}->{$b}->{-count} <=> $results->{-urls}->{$a}->{-count} }  @url_list) {
        $output .= $output_list{$item};
    }
    $output .= "
\n"; $output .=<<"EOF";

Created using $VERSION

EOF # Save the report $self->set({ -output => $output }); if ($output_file) { if (not open (OUTPUTFILE,">$output_file")) { warn ("Unable to open output file '$output_file' for writing: $!\n"); } else { print OUTPUTFILE $output; close (OUTPUTFILE); } } $output; } ########################################################################## =over 4 =item C Parses the '-log_type' and/or '-log_format' to determine the correct regular expression (set into '-log_regex') to analyze the log file. commentary text stolen from mod_log_config.c in the Apache distribution with some additions The argument to LogFormat and CustomLog is a string, which can include literal characters copied into the log files, and '%' directives as follows: %...b: bytes sent, excluding HTTP headers. %...f: filename %...h: remote host %...{Foobar}i: The contents of Foobar: header line(s) in the request sent to the client. %...L: The HTTP protocal level identifier %...l: remote logname (from identd, if supplied) %...{Foobar}n: The contents of note "Foobar" from another module. %...{Foobar}o: The contents of Foobar: header line(s) in the reply. %...m The HTTP method used %...p: the port the request was served to %...P: the process ID of the child that serviced the request. %...r: first line of request %...s: status. For requests that got internally redirected, this is status of the *original* request --- %...>s for the last. %...t: time, in common log format time format %...{format}t: The time, in the form given by format, which should be in strftime(3) format. %...T: the time taken to serve the request, in seconds. %...u: remote user (from auth; may be bogus if return status (%s) is 401) %...U: the URL path requested. %...v: the name of the server (i.e. which virtual host?) The '...' can be nothing at all (e.g. "%h %u %r %s %b"), or it can indicate conditions for inclusion of the item (which will cause it to be replaced with '-' if the condition is not met). Note that there is no escaping performed on the strings from %r, %...i and %...o; some with long memories may remember that I thought this was a bad idea, once upon a time, and I am still not comfortable with it, but it is difficult to see how to "do the right thing" with all of '%..i', unless we URL-escape everything and break with CLF. The forms of condition are a list of HTTP status codes, which may or may not be preceded by '!'. Thus, '%400,501{User-agent}i' logs User-agent: on 400 errors and 501 errors (Bad Request, Not Implemented) only; '%!200,304,302{Referer}i' logs Referer: on all requests which did *not* return some sort of normal status. The default LogFormat reproduces CLF; see below. The way this is supposed to work with virtual hosts is as follows: a virtual host can have its own LogFormat, or its own TransferLog. If it doesn't have its own LogFormat, it inherits from the main server. If it doesn't have its own TransferLog, it writes to the same descriptor (meaning the same process for "| ..."). =cut sub parse_log_format { my ($self) = shift; my ($log_format,$log_type) = $self->get('-log_format','-log_type'); my %log_format_regex_patterns = ( '%b' => '\d+|-', # Number of bytes sent '%h' => '[a-zA-Z0-9-.]+', # Remote host '%f' => '\S+', # Filename '%{referer}i' => '\S*', # Referrer '%{user-agent}i' => '[^"]*', # User Agent '%i' => '[^"]*', # Header junk '%l' => '\S+', # remote logname (identd if provided) '%p' => '\d+', # Port the request was server to '%P' => '\d+', # Process ID of the child that serviced the request '%r' => '[^"]*', # First line of request '%s' => '\d\d\d', # Status Code '%t' => '\[\d\d/\w\w\w/\d\d\d\d:\d\d:\d\d\:\d\d\s[+-]?\d\d\d\d]', # Date/Time stamp in CLF format '%T' => '\d', # Time taken to serve the request in seconds '%u' => '\S+', # Remote user (authentication) '%U' => '\S+', # URL path requested '%v' => '[a-zA-Z0-9.-]+', # Server name '%m' => '\S+', # HTTP protocal method '%L' => '\S+', # HTTP protocal level ); my %log_format_identifiers = ( '%b' => 'bytes', '%h' => 'remotehost', # Remote host '%f' => 'filename', # Filename '%{referer}i' => 'referrer', # Referrer '%{user-agent}i' => 'useragent', # User Agent '%i' => '[^"]*', # Header junk '%l' => 'rfc931', # remote logname (identd if provided) '%p' => 'port', # Port the request was server to '%P' => 'process', # Process ID of the child that serviced the request '%r' => 'request', # First line of request '%s' => 'status', # Status Code '%t' => 'timedate', # Date/Time stamp in CLF format '%T' => 'servicetime', # Time taken to serve the request in seconds '%u' => 'authuser', # Remote user (authentication) '%U' => 'url', # URL path requested '%v' => 'server', # Server name '%m' => 'method', # HTTP method '%L' => 'protocal' # HTTP protocal level ); # [10/Aug/1996:09:55:25 -0700] <-CLF format time if (($log_format eq '') and ($log_type =~ m#^common$#oi)) { $log_format = '%h %l %u %t \"%m %U %L\" %s %b'; } elsif (($log_format eq '') and ($log_type =~ m#^combined$#oi)) { $log_format = '%h %l %u %t \"%m %U %L\" %s %b \"?%{referer}i\"? \"%{user-agent}i\"'; } my $match_format = "\^$log_format"; # Not terminated to add some forgiveness for wrong formats # We are going to assume that everything is seperated by white space my @log_elements = split(/\s+/,$log_format); my $log_field_to_name = {}; my $name_to_log_field = {}; for (my $count=0;$count <= $#log_elements;$count++) { my $ident_item = $log_elements[$count]; $ident_item =~ s#[^%}{a-zA-Z-]##og; if (defined $log_format_regex_patterns{$ident_item}) { my $match_item = quotemeta ($ident_item); my $match_pattern = $log_format_regex_patterns{$ident_item}; $match_format =~ s/$match_item/\($match_pattern\)/; $log_field_to_name->{$count} = $log_format_identifiers{$ident_item}; $name_to_log_field->{$log_format_identifiers{$ident_item}} = $count; } } $self->set({ -log_field_to_name => $log_field_to_name, -name_to_log_field => $name_to_log_field, -log_regex => qr /$match_format/, }); } ############################################################################ =over 4 =item C Declares one or more parameters for use. =back =cut sub declare { my $self = shift; my (@parmnames) = @_; my $parmname; foreach $parmname (@parmnames) { $parmname = lc ($parmname); $self->{-legal_parms}->{$parmname} = 1; } } ############################################################################ =over 4 =item C Sets one or more named parameter values. =back =cut sub set { my $self = shift; my ($parm_ref) = @_; my (@parmnames) = keys %$parm_ref; my $parmname; foreach $parmname (@parmnames) { my $keyname = lc ($parmname); my $value = $parm_ref->{$parmname}; confess ("Attempted to set an undeclared named parameter: $keyname") if (not exists $self->{-legal_parms}->{$keyname}); $self->{-parm_values}->{$keyname} = $value; } } ############################################################################ =over 4 =item C Gets one or more named parameter values. Returned as an array if more than one requested. =back =cut sub get { my $self = shift; my (@results) = (); my (@parmnames) = @_; my $parmname; foreach $parmname (@parmnames) { my $keyname = lc ($parmname); confess ("Attempted to get an undeclared named parameter: $keyname") if (not exists $self->{-legal_parms}->{$keyname}); confess ("Attempted to get an uninitialized named parameter: $keyname") if (not exists $self->{-parm_values}->{$keyname}); push (@results,$self->{-parm_values}->{$keyname}); } return @results; } ############################################################################ =head1 AUTHOR Benjamin Franz =head1 TODO Documentation. Integration. =cut 1;