Next: Index Up: No Title Previous: Abkürzungen, Begriffe
Listing des Perl-Skriptes

#!/usr/bin/perl 
#

#          INSTITUT FUER INFORMATIK
# der Ludwig-Maximilians Universitaet Muenchen
#
#          FAKULTAET FUER INFORMATIK
#    der Technischen Universitaet Muenchen
#
#          Fortgeschrittenenpraktikum
#          --------------------------
# Implementierung eines Analysewerkzeuges fuer
#  Logdateien von WWW-Servern bei der BMW-AG 
#
# Peter Kai Wimmer
# Gabelsbergerstr. 28/IV
# 80333 Muenchen
# Tel./Fax: (089) 523 72 65
# E-Mail: peter@leo.org
#         wimmer@informatik.tu-muenchen.de
#

# Urwald is Bloedsinn, Ernie, ich geh' nach Hause (Bert, Sesamstrasse)
#


$false = 0;
$true  = 1;


#
# standard log file name
#
$logfile = "httpd-log.all";

#
# filter types
#
# global variables with corresponding name and meaning
# must exist
#
@filterTypes = ('host',  'logname',  'authuser',  'method',  'protocol',
                'domain',  'topdomain',  'dir',  'file', 'param', 'browser',
                'status', 'bytes', 'date', 'time');

#
# list types
#
@listTypes = ('hosts', 'lognames', 'authusers', 'methods', 'protocols',
              'domains', 'topdomains', 'dirs', 'files', 'params', 'browsers');

#
# count types
#
@countTypes =  ('bytes', 'requests');

#
# Flags 
#

#
# use regular expressions?
#
$regexp = $false;

#
# print lines verbosely?
#
$verbose = $false;

#
# print logline (false) or list/count (true) ?
#
$printExtended = $false;

#
# sort alphabetically (default; false) or numerically (true)?
#
$sortNum = $false;

#
# Log file format: Common (false) or Netscape (true)?
#
$netscape = $false;

#
# Map months to numbers
#
%monthName = (
  'Jan', 1,
  'Feb', 2,
  'Mar', 3,
  'Apr', 4,
  'May', 5,
  'Jun', 6,
  'Jul', 7,
  'Aug', 8,
  'Sep', 9,
  'Oct', 10,
  'Nov', 11,
  'Dec', 12,
);



#
# check if a batch file is used
# read parameters from that file
#
foreach $arg (@ARGV)
{
  local ($batchfile, @batchline);

  if ($arg =~ /^@.*/)
  {
    $batchfile = substr ($arg,1);
    open(BATCHFILE, $batchfile) || die "$0: Can't open $batchfile\n";
    -T $batchfile               || die "$0: $batchfile: not a batchfile\n";

    while($batchline = <BATCHFILE>)
    {
      ($batchline =~ /^#/) && next;
      @batchline = split (' ', $batchline);
      foreach $arg (@batchline)
      {
        push(@argv, $arg);
        print "$arg\n";
      }
    }
  }
  else 
  {
    push(@argv, $arg);
  }
}



#
# Main Loop
#
MAIN: while(@argv)
{
  local ($arg, $type, $outputType);

  $arg = shift (@argv);

  ($arg eq '-help')     && (&help)                    && exit;
  ($arg eq '-l')        && ($logfile = shift (@argv)) && next;
  ($arg eq '-verbose')  && ($verbose  = $true)        && next;
  ($arg eq '-sort')     && ($sortNum  = $true)        && next;
  ($arg eq '-netscape') && ($netscape = $true)        && next;
  ($arg eq '-regexp')   && ($regexp   = $true)        && next;

  #
  #  Filter
  #
  foreach (@filterTypes)
  {
    if ($arg =~ $_)
    {
      $type = substr($arg,1);
      $arg = shift (@argv);
      while (($arg !~ /^-.*/) && ($arg ne ""))
      {
        # use shell-style expressions
        # instead of regular expressions?
        #
        if (!$regexp)
        {
          #   . -> \.
          $arg =~ s|\.|\\\.|g;
          #   * -> .*
          $arg =~ s|\*|\.\*|g;
          #   ? -> .
          $arg =~ s|\?|\.|g;
          #  [^ -> [\^
          $arg =~ s|\[\^|\[\\\^|g;
        }

        if ($type eq 'status')
        {
	  ($arg eq 'ok')    && ($arg = '20.');
          ($arg eq 'error') && ($arg = '40. 50.');
        }

        $filter{$type} .= $arg . ' ';
        $arg = shift (@argv);
      }

      if ($type !~ /bytes|date|time/)
      {
        $prg_filter .= <<'EOF';
          ($TYPE !~ /^FILTER$/) && return scalar($false);
EOF
        $filtertype = $filter{$type};
        chop $filtertype;
        $filtertype =~ s/ /\$|^/g;
        # escape '/' for regular expr.
        $filtertype =~ s|/|\\/|g;

        $prg_filter =~ s/TYPE/$type/;
        $prg_filter =~ s/FILTER/$filtertype/;
      }

      unshift(@argv, $arg);
      next MAIN;
    }
  }

  #
  #  Output
  #
  if ($arg =~ /-count|-list/)
  {
    # disable line-by-line listing
    $printExtended = $true;

    # 'list' or 'count'
    $outputType = substr($arg,1); 

    $arg = shift (@argv);

    while (($arg !~ /^-.*/) && ($arg ne ""))
    {
      $output{$outputType} .= $arg . ' ';

      #
      # list
      #
      if ($outputType eq "list")
      {
        if (!grep (/^$arg$/, @listTypes))
        {
          print "$0: Unknown list type -- $arg\n";
          die "$0: Try '$0 -help' for more information.\n";
        }

        # cut off trailing 's'
        chop $arg;

        # the following $prg_filter results in (e.g.)
        #   $list{'hosts'} .= $host . ' ';
        $prg_account{'list'} .= <<'EOF';
$list{'TYPEs'} .= $TYPE . ' ';
EOF
        $prg_account{'list'} =~ s/TYPE/$arg/g;
      }

      #
      # count
      #
      if ($outputType eq "count")
      {
        if (!grep (/^$arg$/, @countTypes))
        {
          print "$0: Unknown count type -- $arg\n";
          die "$0: Try '$0 -help' for more information.\n";
        }
      }

      $arg = shift (@argv);
    }
    unshift(@argv, $arg);
    next MAIN;
  }

  #
  # Error
  #
  if ($arg ne "")
  {
    print "$0: Illegal option -- $arg\n";
    die "$0: Try '$0 -help' for more information.\n";
  }
}

# for debugging: code strings
#
# print"Filter: $prg_filter";
# print"List:   $prg_account{'list'}";



open(LOGFILE, $logfile) || die "$0: Can't open $logfile\n";
-T $logfile             || die "$0: $logfile: not a logfile\n";

while($logline = <LOGFILE>)
{
  # two lines from Netscape server
  ($netscape) && (chop $logline) && ($logline .= ' ' . <LOGFILE>);

  &parse;
  if (&filter)
  {
    &account;
    &out;
  }
}

close LOGFILE;


foreach (@listTypes)
{
  ($list{$_}) && (&printArray (&countItems($list{$_})));
}

foreach (@countTypes)
{
  ($count{$_}) && (print "$_ $count{$_}\n");
}



#
#  Parse
#

sub parse
{
  #  if you use '/' as delimiter for RegExp, you need to
  #  escape it, since '/' also occurs in $path

  #
  # Parse $logline
  # parse $request: $file, $params
  #
  if ($netscape)
  {
    ($host, $logname, $authuser, $date, $request, $status, $bytes, $domain, $browser, $os) =
      $logline =~ /^(.*) (.*) (.*) \[(.*)\] \"(.*)\" (.*) (.*) (.*) (.*) \((.*)\)$/;

    ($method, $path, $httpVersion) =
      $request =~ m|^(\w+) /(.*) HTTP/(.*)$|;
  }
  else
  {
    ($host, $logname, $authuser, $date, $request, $status, $bytes) =
      $logline =~ /^(.*) (.*) (.*) \[(.*)\] \"(.*)\" (.*) (.*)$/;

    ($method, $protocol, $path, $httpVersion) =
      $request =~ m|^(\w+) (\w+)://(.*) HTTP/(.*)$|;
  }

  #
  #  Date, Time  (e.g. 29/Sep/1995:13:13:11)
  #
  ($day, $month, $year, $hour, $minute, $second) =
    $date =~ m|^(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+)|;

  #
  # exchange name of month with number
  #
  $month = $monthName{$month};

  #
  # parse $path: ($domain,) $dir, $file, $param
  #
  ($path, $param) =
    $path =~ /([^\?]*)\??([^\?]*)$/;

  if  ($netscape)
  {
    ($dir, $file) =
      $path =~ m|^(.*/)?([^/]*)$|;
  }
  else
  {
    ($domain, $dir, $file) =
      $path =~ m|^([^/]+)/(.*/)?([^/]*)$|;
  }
  $dir = '/' . $dir;


  #
  # parse $domain: $topdomain, $port
  #
  ($topdomain, $port) =
    $domain =~ /.*\.([^:]*)?:?(\d+)?$/;
}



#
#  Filter
#

sub filter
{
  local ($filter, $type);
  local ($min, $max);
  local (@filterRange, $range);

  # perl versions 4 and 5 seem to interpret
  # 'eval' differently;
  # eval $prg_filter; # works for version 4, but f**k version 5
  ((eval $prg_filter) eq '0') && return $false;


  #
  #  Bytes
  #
  if ($filter{'bytes'})
  {
    @filterRange = split(' ', $filter{'bytes'});
    foreach $range (@filterRange)
    {
      ($min, $max) =
        $range =~ /(\d*)-(\d*)/;
      ($bytes < $min) && return $false;
      ($max) && ($bytes > $max) && return $false;
    }
  }

  #
  # date
  #
  if ($filter{'date'})
  {
    @filterRange = split(' ', $filter{'date'});
    foreach $range (@filterRange)
    {
      ($min, $max) =
        $range =~ m|([\d/]+)-?([\d/]*)|;

      ($minDay, $minMonth, $minYear) =
      ($maxDay, $maxMonth, $maxYear) =
        $min =~ m|^(\d+)/(\d+)/(\d+)$|;

      if ($max)
      {
        ($maxDay, $maxMonth, $maxYear) =
          $max =~ m|(\d+)/(\d+)/(\d+)|;
      }

      (($year < $minYear)
      || (($year == $minYear) && ($month < $minMonth))
      || (($year == $minYear) && ($month == $minMonth) && ($day < $minDay)))
      && return $false;

      (($year > $maxYear)
      || (($year == $maxYear) && ($month > $maxMonth))
      || (($year == $maxYear) && ($month == $maxMonth) && ($day > $maxDay)))
      && return $false;
    }
  }

  #
  # time
  #
  if ($filter{'time'})
  {
    @filterRange = split(' ', $filter{'time'});
    foreach $range (@filterRange)
    {
      ($min, $max) =
        $range =~ m|([\d:]+)-?([\d:]*)|;

      ($minHour, $minMin, $minSec) =
      ($maxHour, $maxMin, $maxSec) =
        $min =~ m|^(\d+):(\d+):(\d+)$|;

      if ($max)
      {
        ($maxHour, $maxMin, $maxSec) =
          $max =~ m|(\d+):(\d+):(\d+)|;
      }

      (($hour < $minHour)
      || (($hour == $minHour) && ($minute < $minMin)) 
      || (($hour == $minHour) && ($minute == $minMin) && ($second < $minSecond)))
      && return $false;

      (($hour > $maxHour)
      || (($hour == $maxHour) && ($minute > $maxMin)) 
      || (($hour == $maxHour) && ($minute == $maxMin) && ($second > $maxSecond)))
      && return $false;
    }
  }

  return $true;
}



#
#  Accounting
#

sub account
{
  local (@cntTypes);

  #
  # List
  #
  eval $prg_account{'list'};

  #
  #  Count
  #
  @cntTypes = split (' ', $output{'count'});
  (grep (/^bytes$/,    @cntTypes)) && ($count{'bytes'} += $bytes);
  (grep (/^requests$/, @cntTypes)) && ($count{'requests'}++);

}



#
#  Count items in an array
#

sub countItems
{
  local ($list) = @_;
  local (@array, @keys, @resultArray);
  local (%count);

  @array = sort(split(' ', $list));

  # count occurences
  # Programming Perl, p.254
  for (@array)
  {
    $count{$_}++;
  }

  # sort alphabetically
  # Programming Perl, p. 235
  for (reverse sort keys %count)
  {
    push (@resultArray, ($_ . ' ' . $count{$_}));
  }

  # sort numerically
  # Programming Perl, p. 249
  if ($sortNum)
  {
    foreach (@resultArray)
    {
      push (@keys, (split(/ /))[1] );
    }
  }

  # subroutine for sorting numerically
  sub byNumber { $keys[$a] <=> $keys[$b]; }
  @sortArray = @resultArray[reverse sort byNumber $[..$#resultArray];
}



#
# Print an array
#
# Programming Perl, p. 230
# this is preferred to  print join("\n", @array)
# since the following code is more efficient for
# a very long array and more space-conservative
#

sub printArray
{
  local (@array) = @_;
  local ($,, $\) = ("\n", "\n");
  print @array;
}



#
#  Output a logline
#

sub out
{
  ($printExtended) && return;

    if ($verbose)
    {
      print "\nHost:     $host\n";
      print "Logname:  $logname\n";
      print "Authuser: $authuser\n";
      print "Date:     $date\n";

      # request
      print "Method:   $method\n";
      (!$netscape) && print "Protocol: $protocol\n";

      # path
      print "Domain:   $domain\n";
      print "topDom:   $topdomain\n";
      ($port)  && print "Port:     $port\n";
      print "Dir:      $dir\n";
      print "File:     $file\n";
      ($param) && print "Param:    $param\n";

      print "HTTP-Ver: $httpVersion\n";

      print "Status:   $status\n";
      print "Bytes:    $bytes\n";

      if ($netscape)
      {
        print "Browser:          $browser\n";
        print "Operating System: $os\n";
      }
    }
    else
    {
      print $logline;
    }
}



#
# Help
#

sub help
{
  print <<EOF;

WWW log file analyzer 1.0
Peter Kai Wimmer (peter\@leo.org)

 -l         logfile
 -netscape  logfile is in NetScape format (default: common log)
 -regexp    use regular expressions instead of shell-style expr.
 -sort      sort -list output by number

Filter:
 -host 
 -logname                           wimmer
 -authuser
 -date      dd/mm/yyyy-dd/mm/yyyy   01/10/1995-30/10/1995
 -time      hh:mm:ss-hh:mm:ss       06:00:00-07:30:00
 -method                            GET, POST, ...
 -protocol                          http, ftp, ...
 -domain                            www.ibm.com, ...
 -topdomain                         de, com, edu, ...
 -dir 
 -file 
 -param
 -status                            303, 20*, ok, error
 -bytes     xxxxx-yyyyy
 -browser

Output:
 -count     bytes | requests
 -list      hosts | lognames | authusers | methods | protocols | 
            domains | topdomains | dirs | files | params | browsers
without '-count' or '-list': logline itself
 -verbose   print verbosely

'\@' preceding a file name reads parameters from a file  (e.g. '\@args')

EOF
}