#!/usr/bin/perl
#
# INSTITUT FUER INFORMATIK
# der Ludwig-Maximilians Universitaet Muenchen
#
# FAKULTAET FUER INFORMATIK
# der Technischen Universitaet Muenchen
#
# Fortgeschrittenenpraktikum
# --------------------------
# Implementierung eines Analysewerkzeuges fuer
# Logdateien von WWW-Servern bei der BMW-AG
#
# Peter Kai Wimmer
# Gabelsbergerstr. 28/IV
# 80333 Muenchen
# Tel./Fax: (089) 523 72 65
# E-Mail: peter@leo.org
# wimmer@informatik.tu-muenchen.de
#
# Urwald is Bloedsinn, Ernie, ich geh' nach Hause (Bert, Sesamstrasse)
#
#
# Boolean constants
#
($false, $true) = (0, 1);
#
# Log file that is read when no '-l' option is given
#
$logfile = 'httpd-log.all';
#
# Filter types
#
# Each name is usable as a '-name' option.  For every type that is
# checked by generated code (all but 'bytes', 'date' and 'time') a
# global variable with the same name and meaning must exist.
#
@filterTypes = (
    'host', 'logname', 'authuser', 'method', 'protocol',
    'domain', 'topdomain', 'dir', 'file', 'param', 'browser',
    'status', 'bytes', 'date', 'time',
);
#
# List types (arguments of '-list')
#
# The name without its trailing 's' is the global variable whose
# values are collected.
#
@listTypes = (
    'hosts', 'lognames', 'authusers', 'methods', 'protocols',
    'domains', 'topdomains', 'dirs', 'files', 'params', 'browsers',
);
#
# Count types (arguments of '-count')
#
@countTypes = ('bytes', 'requests');
#
# Flags, all switched off by default
#
#   $regexp         patterns are regular expressions (not shell-style)
#   $verbose        print matching lines verbosely
#   $printExtended  print list/count (true) instead of the log line (false)
#   $sortNum        sort numerically (true) instead of alphabetically (false)
#   $netscape       log file format is Netscape (true) or Common (false)
#
$regexp = $verbose = $printExtended = $sortNum = $netscape = $false;
#
# Map month names to month numbers
#
%monthName = (
    'Jan',  1, 'Feb',  2, 'Mar',  3, 'Apr',  4,
    'May',  5, 'Jun',  6, 'Jul',  7, 'Aug',  8,
    'Sep',  9, 'Oct', 10, 'Nov', 11, 'Dec', 12,
);
#
# check if a batch file is used
# read parameters from that file
#
foreach $arg (@ARGV)
{
    # Copies the command line into @argv.  An argument of the form
    # '@file' is replaced by the whitespace-separated words found in
    # 'file'; lines of that file starting with '#' are skipped.
    local ($batchfile, $batchline, $word);
    if ($arg =~ /^@.*/)
    {
        # strip the leading '@'
        $batchfile = substr ($arg, 1);
        open(BATCHFILE, $batchfile) || die "$0: Can't open $batchfile\n";
        -T $batchfile || die "$0: $batchfile: not a batchfile\n";
        while ($batchline = <BATCHFILE>)
        {
            ($batchline =~ /^#/) && next;
            # a separate loop variable keeps the outer $arg readable
            foreach $word (split (' ', $batchline))
            {
                push(@argv, $word);
                # every parameter taken from the batch file is echoed
                print "$word\n";
            }
        }
        # release the handle instead of keeping it open until the next
        # '@file' argument (or the end of the program) closes it
        close(BATCHFILE);
    }
    else
    {
        push(@argv, $arg);
    }
}
#
# Main Loop
#
MAIN: while(@argv)
{
    # Consumes @argv and prepares the log file pass:
    #   flags    $logfile, $verbose, $sortNum, $netscape, $regexp
    #   filters  $filter{TYPE}  blank-separated patterns or ranges
    #            $prg_filter    perl code that is evaluated by &filter
    #   output   $output{'list'}, $output{'count'}  blank-separated names
    #            $prg_account{'list'}  perl code evaluated by &account
    local ($arg, $type, $outputType, $filtertype);
    $arg = shift (@argv);
    #
    # Simple options
    #
    # 'exit' is only reached because &help returns the (true) result
    # of its print
    ($arg eq '-help') && (&help) && exit;
    ($arg eq '-l') && ($logfile = shift (@argv)) && next;
    ($arg eq '-verbose') && ($verbose = $true) && next;
    ($arg eq '-sort') && ($sortNum = $true) && next;
    ($arg eq '-netscape') && ($netscape = $true) && next;
    ($arg eq '-regexp') && ($regexp = $true) && next;
    #
    # Filter
    #
    # The option must be exactly '-' followed by a filter type.  A mere
    # substring match would also accept e.g. '-hostile' or a stray word
    # such as 'time' and invent a filter type from it.
    #
    if (grep ($arg eq '-' . $_, @filterTypes))
    {
        $type = substr($arg,1);
        $arg = shift (@argv);
        # everything up to the next option is a value of this filter
        while (($arg !~ /^-.*/) && ($arg ne ""))
        {
            # use shell-style expressions
            # instead of regular expressions?
            #
            if (!$regexp)
            {
                # . -> \.
                $arg =~ s|\.|\\\.|g;
                # * -> .*
                $arg =~ s|\*|\.\*|g;
                # ? -> .
                $arg =~ s|\?|\.|g;
                # [^ -> [\^
                $arg =~ s|\[\^|\[\\\^|g;
            }
            if ($type eq 'status')
            {
                # symbolic names for groups of status codes
                ($arg eq 'ok') && ($arg = '20.');
                ($arg eq 'error') && ($arg = '40. 50.');
            }
            $filter{$type} .= $arg . ' ';
            $arg = shift (@argv);
        }
        # bytes, date and time are ranges and are checked by
        # hand-written code in &filter; all other types are checked
        # by a generated pattern match
        if ($type !~ /bytes|date|time/)
        {
            $filtertype = $filter{$type};
            # cut off the trailing blank
            chop $filtertype;
            # "a b" -> "a$|^b": one anchored alternative per value
            $filtertype =~ s/ /\$|^/g;
            # escape '/' for regular expr.
            $filtertype =~ s|/|\\/|g;
            # The statement is assembled on its own and then appended,
            # so that filters added earlier are never modified again;
            # the result is (e.g.)
            #   ($host !~ /^www\.leo\.org$/) && return scalar($false);
            # NOTE: the pattern is copied verbatim into code that is
            # eval'ed later and only '/' is escaped, so command line
            # and batch files have to be trusted input.
            $prg_filter .= '($' . $type . ' !~ /^' . $filtertype
                . '$/) && return scalar($false);' . "\n";
        }
        # hand the lookahead argument back to the main loop
        unshift(@argv, $arg);
        next MAIN;
    }
    #
    # Output
    #
    if (($arg eq '-count') || ($arg eq '-list'))
    {
        # disable line-by-line listing
        $printExtended = $true;
        # 'list' or 'count'
        $outputType = substr($arg,1);
        $arg = shift (@argv);
        while (($arg !~ /^-.*/) && ($arg ne ""))
        {
            $output{$outputType} .= $arg . ' ';
            #
            # list
            #
            if ($outputType eq "list")
            {
                # compare literally: the argument is a name, not a pattern
                if (!grep ($_ eq $arg, @listTypes))
                {
                    print "$0: Unknown list type -- $arg\n";
                    die "$0: Try '$0 -help' for more information.\n";
                }
                # cut off trailing 's'
                chop $arg;
                # the appended statement is (e.g.)
                # $list{'hosts'} .= $host . ' ';
                $prg_account{'list'} .= '$list{\'' . $arg . 's\'} .= $'
                    . $arg . ' . \' \';' . "\n";
            }
            #
            # count
            #
            if ($outputType eq "count")
            {
                if (!grep ($_ eq $arg, @countTypes))
                {
                    print "$0: Unknown count type -- $arg\n";
                    die "$0: Try '$0 -help' for more information.\n";
                }
            }
            $arg = shift (@argv);
        }
        # hand the lookahead argument back to the main loop
        unshift(@argv, $arg);
        next MAIN;
    }
    #
    # Error
    #
    if ($arg ne "")
    {
        print "$0: Illegal option -- $arg\n";
        die "$0: Try '$0 -help' for more information.\n";
    }
}
# for debugging: code strings
#
# print"Filter: $prg_filter";
# print"List: $prg_account{'list'}";
# Pass over the log file: every record that gets through &filter is
# accounted for and handed to &out; afterwards the collected lists and
# counters are printed.
unless (open(LOGFILE, $logfile))
{
    die "$0: Can't open $logfile\n";
}
unless (-T $logfile)
{
    die "$0: $logfile: not a logfile\n";
}
while($logline = <LOGFILE>)
{
    # a record from a Netscape server consists of two lines:
    # drop the line end and append the second line after a blank
    if ($netscape && (chop $logline))
    {
        $logline .= ' ' . <LOGFILE>;
    }
    &parse;
    (&filter) || next;
    &account;
    &out;
}
close LOGFILE;
# lists: one "item count" line per distinct item
foreach $listType (@listTypes)
{
    if ($list{$listType})
    {
        &printArray (&countItems($list{$listType}));
    }
}
# counters: only those with a true (non-zero) value are printed
foreach $countType (@countTypes)
{
    if ($count{$countType})
    {
        print "$countType $count{$countType}\n";
    }
}
#
# Parse
#
sub parse
{
    # Splits the global $logline into the global fields that are used
    # by &filter, &account and &out.  Takes no arguments.
    #
    # '|' serves as regexp delimiter wherever a pattern contains '/',
    # which would otherwise have to be escaped.
    #
    # Step 1: the line as a whole
    #
    if (!$netscape)
    {
        ($host, $logname, $authuser, $date, $request, $status, $bytes) =
            $logline =~ /^(.*) (.*) (.*) \[(.*)\] \"(.*)\" (.*) (.*)$/;
    }
    else
    {
        ($host, $logname, $authuser, $date, $request, $status, $bytes, $domain, $browser, $os) =
            $logline =~ /^(.*) (.*) (.*) \[(.*)\] \"(.*)\" (.*) (.*) (.*) (.*) \((.*)\)$/;
    }
    #
    # Step 2: date and time (e.g. 29/Sep/1995:13:13:11);
    # the name of the month is exchanged for its number
    #
    ($day, $month, $year, $hour, $minute, $second) =
        $date =~ m|^(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+)|;
    $month = $monthName{$month};
    #
    # Step 3: request -> method, (protocol,) path, HTTP version
    #         path    -> (domain,) dir, file, param
    #
    if (!$netscape)
    {
        # the Common format request carries protocol and domain
        ($method, $protocol, $path, $httpVersion) =
            $request =~ m|^(\w+) (\w+)://(.*) HTTP/(.*)$|;
        ($path, $param) =
            $path =~ /([^\?]*)\??([^\?]*)$/;
        ($domain, $dir, $file) =
            $path =~ m|^([^/]+)/(.*/)?([^/]*)$|;
    }
    else
    {
        # the Netscape format has the domain in the log line itself
        ($method, $path, $httpVersion) =
            $request =~ m|^(\w+) /(.*) HTTP/(.*)$|;
        ($path, $param) =
            $path =~ /([^\?]*)\??([^\?]*)$/;
        ($dir, $file) =
            $path =~ m|^(.*/)?([^/]*)$|;
    }
    # the patterns above capture the directory without its leading '/'
    $dir = '/' . $dir;
    #
    # Step 4: domain -> topdomain, port
    #
    ($topdomain, $port) =
        $domain =~ /.*\.([^:]*)?:?(\d+)?$/;
}
#
# Filter
#
sub filter
{
    # Decides whether the log line that was split up by &parse passes
    # all filters given on the command line.  Takes no arguments; reads
    # the globals set by &parse, $prg_filter and %filter.
    # Returns $true if the line passes, $false otherwise.
    #
    # Several ranges given for 'bytes', 'date' or 'time' must all be
    # satisfied: the first range that does not contain the line's value
    # rejects the line.
    local ($min, $max);
    local (@filterRange, $range);
    local ($minDay, $minMonth, $minYear, $maxDay, $maxMonth, $maxYear);
    local ($minHour, $minMin, $minSec, $maxHour, $maxMin, $maxSec);
    # Pattern filters: the generated code leaves the eval with
    # $false (0) as soon as one pattern does not match.
    # According to the original author's note a bare
    # 'eval $prg_filter;' was enough for perl 4 but not for perl 5,
    # hence the explicit check of the eval result.
    ((eval $prg_filter) eq '0') && return $false;
    #
    # Bytes (ranges of the form min-max)
    #
    if ($filter{'bytes'})
    {
        @filterRange = split(' ', $filter{'bytes'});
        foreach $range (@filterRange)
        {
            ($min, $max) =
                $range =~ /(\d*)-(\d*)/;
            ($bytes < $min) && return $false;
            # an empty (or zero) upper bound is not checked
            ($max) && ($bytes > $max) && return $false;
        }
    }
    #
    # date (dd/mm/yyyy or dd/mm/yyyy-dd/mm/yyyy)
    #
    if ($filter{'date'})
    {
        @filterRange = split(' ', $filter{'date'});
        foreach $range (@filterRange)
        {
            ($min, $max) =
                $range =~ m|([\d/]+)-?([\d/]*)|;
            ($minDay, $minMonth, $minYear) =
                $min =~ m|^(\d+)/(\d+)/(\d+)$|;
            # without an upper bound the range is the single day $min
            ($maxDay, $maxMonth, $maxYear) = ($minDay, $minMonth, $minYear);
            if ($max)
            {
                ($maxDay, $maxMonth, $maxYear) =
                    $max =~ m|(\d+)/(\d+)/(\d+)|;
            }
            (&compareTriple($year, $month, $day,
                            $minYear, $minMonth, $minDay) < 0)
                && return $false;
            (&compareTriple($year, $month, $day,
                            $maxYear, $maxMonth, $maxDay) > 0)
                && return $false;
        }
    }
    #
    # time (hh:mm:ss or hh:mm:ss-hh:mm:ss)
    #
    if ($filter{'time'})
    {
        @filterRange = split(' ', $filter{'time'});
        foreach $range (@filterRange)
        {
            ($min, $max) =
                $range =~ m|([\d:]+)-?([\d:]*)|;
            ($minHour, $minMin, $minSec) =
                $min =~ m|^(\d+):(\d+):(\d+)$|;
            # without an upper bound the range is the single second $min
            ($maxHour, $maxMin, $maxSec) = ($minHour, $minMin, $minSec);
            if ($max)
            {
                ($maxHour, $maxMin, $maxSec) =
                    $max =~ m|(\d+):(\d+):(\d+)|;
            }
            # the seconds of both bounds take part in the comparison
            # ($minSec/$maxSec are the variables filled in above)
            (&compareTriple($hour, $minute, $second,
                            $minHour, $minMin, $minSec) < 0)
                && return $false;
            (&compareTriple($hour, $minute, $second,
                            $maxHour, $maxMin, $maxSec) > 0)
                && return $false;
        }
    }
    return $true;
}
#
# Helper of &filter: compares two three-part values (year, month, day
# or hour, minute, second), most significant part first.
# Arguments: the three parts of the first value followed by the three
# parts of the second one.  Returns -1, 0 or 1 like '<=>'.
#
sub compareTriple
{
    local ($a1, $a2, $a3, $b1, $b2, $b3) = @_;
    return (($a1 <=> $b1) || ($a2 <=> $b2) || ($a3 <=> $b3));
}
#
# Accounting
#
sub account
{
    # Books the current log line: appends its values to the requested
    # lists and adds it to the requested counters.  Takes no arguments.
    local ($wantBytes, $wantRequests, $countType);
    #
    # List: run the statements generated for the '-list' types
    #
    eval $prg_account{'list'};
    #
    # Count: find out which counters were asked for ...
    #
    foreach $countType (split (' ', $output{'count'}))
    {
        if ($countType eq 'bytes')
        {
            $wantBytes = 1;
        }
        if ($countType eq 'requests')
        {
            $wantRequests = 1;
        }
    }
    # ... and update each of them once per log line
    if ($wantBytes)
    {
        $count{'bytes'} += $bytes;
    }
    if ($wantRequests)
    {
        $count{'requests'}++;
    }
}
#
# Count items in an array
#
sub countItems
{
    # Argument: a string of blank-separated items.
    # Returns one "item count" string per distinct item, ordered
    # alphabetically or, if $sortNum is set, by descending count with
    # items of equal count in alphabetical order.
    # The result is also left in the global @sortArray.
    local ($list) = @_;
    local (%tally, @names, @ranked, $name);
    # count occurrences
    foreach $name (split(' ', $list))
    {
        $tally{$name}++;
    }
    if ($sortNum)
    {
        # sort numerically, highest count first
        @names = sort { ($tally{$b} <=> $tally{$a}) || ($a cmp $b) } keys %tally;
    }
    else
    {
        # sort alphabetically
        @names = sort keys %tally;
    }
    foreach $name (@names)
    {
        push (@ranked, $name . ' ' . $tally{$name});
    }
    @sortArray = @ranked;
}
#
# Print an array
#
# Programming Perl, p. 230
# this is preferred to print join("\n", @array)
# since the following code is more efficient for
# a very long array and more space-conservative
#
sub printArray
{
    # Prints every argument on a line of its own.
    local (@lines) = @_;
    # output record separator: written once after the last item
    local ($\) = "\n";
    # output field separator: written between two items
    local ($,) = $\;
    print @lines;
}
#
# Output a logline
#
sub out
{
    # Prints the current log line, either unchanged or (with $verbose)
    # as one labelled field per line.  Takes no arguments.
    #
    # no per-line output while -list/-count output is active
    return if $printExtended;
    if (!$verbose)
    {
        print $logline;
    }
    else
    {
        print "\nHost: $host\n";
        print "Logname: $logname\n";
        print "Authuser: $authuser\n";
        print "Date: $date\n";
        # request
        print "Method: $method\n";
        print "Protocol: $protocol\n" unless $netscape;
        # path
        print "Domain: $domain\n";
        print "topDom: $topdomain\n";
        print "Port: $port\n" if $port;
        print "Dir: $dir\n";
        print "File: $file\n";
        print "Param: $param\n" if $param;
        print "HTTP-Ver: $httpVersion\n";
        print "Status: $status\n";
        print "Bytes: $bytes\n";
        # fields that only the Netscape format provides
        if ($netscape)
        {
            print "Browser: $browser\n";
            print "Operating System: $os\n";
        }
    }
}
#
# Help
#
sub help
{
    # Prints the usage text.  The result of the print is returned;
    # the handling of '-help' relies on it being true.
    local ($usage);
    $usage = <<EOF;
WWW log file analyzer 1.0
Peter Kai Wimmer (peter\@leo.org)
-l logfile
-netscape logfile is in NetScape format (default: common log)
-regexp use regular expressions instead of shell-style expr.
-sort sort -list output by number
Filter:
-host
-logname wimmer
-authuser
-date dd/mm/yyyy-dd/mm/yyyy 01/10/1995-30/10/1995
-time hh:mm:ss-hh:mm:ss 06:00:00-07:30:00
-method GET, POST, ...
-protocol http, ftp, ...
-domain www.ibm.com, ...
-topdomain de, com, edu, ...
-dir
-file
-param
-status 303, 20*, ok, error
-bytes xxxxx-yyyyy
-browser
Output:
-count bytes | requests
-list hosts | lognames | authusers | methods | protocols |
domains | topdomains | dirs | files | params | browsers
without '-count' or '-list': logline itself
-verbose print verbosely
'\@' preceding a file name reads parameters from a file (e.g. '\@args')
EOF
    print $usage;
}