#! /usr/bin/perl -w

# vim:syntax=perl

use strict;
use lib '/usr/share/perl5';

package Lire::Proxy::MS_ISALog;

use base qw/Lire::W3CExtendedLog/;

use Lire::DlfSchema;
use Lire::W3CExtendedLog;
use Lire::Program qw( :msg );
use Lire::Utils qw/parse_url/;


my $debug = 0;
sub debug {
    $debug and lr_debug($_[0]);
}


my $schema = Lire::DlfSchema::load_schema( "proxy" );

my %ms_type2regex = (
    # MS enhanced string, using tab as field separator
    ms_string => '([^\t]+)',
);

my %ms_identifier2type = (
    # MS workaround
    'username'       => 'ms_string',
    'agent'          => 'ms_string',
    'referred'       => 'name',
    'host'           => 'name',
    'port'           => 'integer',
    'computername'   => 'uri',
    'object-source'  => 'uri',
    'operation'      => 'uri',
    'protocol'       => 'uri',
    'mime-type'	     => 'uri',
    'rule#1'	     => 'uri',
    'rule#2'	     => 'uri',
);

# see http://www.w3.org/TR/WD-logfile.html
my %isa_field2proxy_dlf =
  (
    'c-ip'           => 'client_ip',
    'c-host'	     => 'client_host',
    'cs-username'    => 'user',

    'c-agent'        => 'useragent',
    # e.g. Mozilla/4.0 (compatible; MSIE 5.0; Win32)
    # or Outlook Express/5.0 (MSIE 5.0; Windows 98; DigExt)

    # 's-computername' => 'computername',
    # Proxy name (s-computername) The name of the computer running ISA
    # Server. This is the computer name that is assigned in Windows
    # 2000. e.g. GRO1SYX01 
    # Currently unmapped

    # 'cs-referred'    => 'result_src_host',
    # Referring server name (cs-referred): If ISA Server is used upstream in
    # a chained configuration, this indicates the server name of the
    # downstream server that sent the request.
    # Unmapped for now.

    'r-host'         => 'result_src_host',
    'r-ip'           => 'result_src_ip',
    'r-port'         => 'result_src_port',
    # For the Web Proxy service, a hyphen (-) in this field may
    # indicate that an object was sourced from the Web Proxy server
    # cache and not from the destination. One exception is negative
    # caching. In that case, this field indicates a destination IP
    # address for which a negative-cached object was returned.

    # 'sc-bytes'       => 'bytes',
    # We'll add up cs-bytes and sc-bytes later on

    'sc-status'	     => 'req_result',

    'cs-protocol'    => 'protocol',

    's-operation'    => 'operation',

    #'cs-uri'         => ,
    # This will be parsed for dst_host and requested_url

    # 's-object-source' => 'result_src_code',
    # Object source (s-object-source) Indicates the source that was used to 
    # retrieve the current object. This field applies only to the Web Proxy
    # service log.
    # This overlaps the cache_result and result_src_result

    'time-taken'     => 'duration',
    # This is in millisecond, we need to postprocess it

    'rule#1'	     => 'rule',
    'rule#2'	     => 'rule',
    # Rule#1 is about protocol rule
    # Rule#2 is about content filtering
    # Only one of the two should be defined in each record

    'cs-mime-type'   => 'type',
  );

# values and their meanings:
# 0           No source information is available.
# Cache       Source is the cache. Object returned from cache.
# Inet        Source is the Internet. Object added to cache.
# Member      Returned from another array member.
# NotModified Source is the cache. Client performed an If-Modified-Since
#              request and object had not been modified.
# NVCache     Source is the cache. Object could not be verified to source.
# Upstream    Object returned from an upstream proxy cache.
# Vcache      Source is the cache. Object was verified to source and had
#              not been modified. 
# VFInet      Source is the Internet. Cached object was verified to source
#              and had been modified.
my %ms_isa2cache_result =
  (
   0	    => 'NONE',
   Cache    => 'TCP_HIT',
   Inet	    => 'TCP_MISS',
   Member   => 'TCP_MISS',
   NotModified => 'TCP_IMS_HIT',
   NVCache  => 'TCP_REF_FAIL_HIT',
   Upstream => 'TCP_MISS',
   VCache   => 'TCP_REFRESH_HIT',
   VFInet   => 'TCP_REFRESH_MISS',
  );

my %ms_isa2result_src_code =
  (
   0	    => 'NONE',
   Cache    => 'NONE',
   Inet	    => 'DIRECT',
   Member   => 'SIBLING_HIT',
   NotModified => 'NONE',
   NVCache  => 'NONE',
   Upstream => 'PARENT_HIT',
   VCache   => 'NONE',
   VFInet   => 'DIRECT',
  );

sub build_parser {
    my ( $self ) = shift;

    $self->{field2re} = %ms_type2regex;

    for my $k (keys %ms_identifier2type) {
        if (defined $ms_identifier2type{$k}) {
            # possibly overwrite Lire::W3CExtendedLog values
            $self->{identifier2type}->{$k} = $ms_identifier2type{$k};
        } else {
            if (defined $self->{identifier2type}->{$k}) {
                lr_warn( "ms_identifier2type undefined in '$k', keeping W3C " .
                  "default '" . $self->{identifier2type}->{$k} .
                  "' for identifier2type" );
            } else {
                lr_warn( "ms_identifier2type undefined in '$k', " .
                  "identifier2type stays undefined" );
            }
        }
    }

    $debug and do {
        debug( "just filled identifier2type hash, dumping it" );
        while ((my $k, my $v) = each %{ $self->{identifier2type} }) {
            if (defined $v) {
                debug( "identifier2type{'$k'} = '$v'" );
            } else {
                debug( "identifier2type{'$k'} undefined" );
            }
        }
        debug( "... done" );
    };

    # Override some types' lexer
    while ( my ( $type, $rx ) = each %ms_type2regex ) {
	$self->{type2regex}{$type} = $rx;
    }

    $self->SUPER::build_parser( @_ );

    my @fields = split /\s+/, $self->{fields};
    my %fields = map { $_ => 1 } @fields;

    my @mapped   = ();
    my @dlf_fields;
    foreach my $f ( @fields ) {
        if ( exists $isa_field2proxy_dlf{$f} ) {
            push @mapped, $f;
            push @dlf_fields, $isa_field2proxy_dlf{$f};
        }
    }

    # Create the DLF maker function
    push @dlf_fields, "time" if $fields{time};
    push @dlf_fields, "client_host" if $fields{'c-ip'};
    push @dlf_fields, "bytes" if $fields{'cs-bytes'} || $fields{'sc-bytes'};
    push @dlf_fields, "dst_host", "requested_url", "dst_port", "protocol"
      if $fields{'cs-uri'};
    push @dlf_fields, "cache_result", "result_src_code"
      if $fields{'s-object-source'};

    # Keep only one of each
    my %dlf_fields = map { $_ => 1 } @dlf_fields;
    @dlf_fields = sort keys %dlf_fields;

    lr_info( "mapped DLF fields: ", join( ", ", @dlf_fields ) );

    my $dlf_maker = $schema->make_hashref2asciidlf_func( @dlf_fields );

    $self->{proxy_dlf_converter} = sub {
        my $w3c = $self->{w3c_parser}->( $_[0] );

        # Those fields that are mapped directly
        my %dlf = ( time => $w3c->{lire_time} );
	$dlf{bytes} = 0;
	$dlf{bytes} += $w3c->{'cs-bytes'}
	  if $w3c->{'cs-bytes'} && $w3c->{'cs-bytes'} ne '-';
	$dlf{bytes} += $w3c->{'sc-bytes'}
	  if $w3c->{'sc-bytes'} && $w3c->{'sc-bytes'} ne '-';

	if ( exists $w3c->{'cs-uri'}) {
	    my $url = parse_url( $w3c->{'cs-uri'} );
	    $dlf{'dst_host'}	    = $url->{'host'};
	    $dlf{'requested_url'}   = $url->{'path'};
	    $dlf{'dst_port'}	    = $url->{'port'};
	    $dlf{'protocol'}	    = $url->{'protocol'};
	    # That last one may be overriden by cs-protocol
	}

	if ( exists $w3c->{'s-object-source'} ) {
	    my $code = $w3c->{'s-object-source'};
	    if ( $code eq '-' ) {
		if ( exists $w3c->{'sc-status'} &&
		     $w3c->{'sc-status'} == 403 )
		{
		    # Special case for denied requests
		    $dlf{cache_result}	    = 'TCP_DENIED';
		    $dlf{result_src_code}   = 'NONE';
		} else {
		    $dlf{cache_result}	    = 'NONE';
		    $dlf{result_src_code}   = 'NONE';
		}
	    } else {
		$dlf{cache_result}    = $ms_isa2cache_result{$code};
		$dlf{result_src_code} = $ms_isa2result_src_code{$code};

		$dlf{cache_result} = 'TCP_NEGATIVE_HIT'
		  if  exists $dlf{req_result} && $dlf{req_result} == 404 &&
		   $dlf{cache_result} eq 'TCP_HIT';
	    }
	}

        foreach my $name ( @mapped ) {
	    # Map field only if it has the non-default value '-'
            $dlf{$isa_field2proxy_dlf{$name}} = $w3c->{$name}
	      if $w3c->{$name} ne '-';
        }

	$dlf{duration} = sprintf '%.3f', $dlf{duration} / 1_000
	  if exists $dlf{duration};

	# Put client_ip into client_host when it is missing.
	$dlf{client_host} ||= $dlf{client_ip};

        return $dlf_maker->( \%dlf );
    }
}




package main;

use Lire::Program qw( :msg :dlf );

my $lines       = 0;
my $dlflines    = 0;
my $errorlines  = 0;

init_dlf_converter( "proxy" );

my $parser = new Lire::Proxy::MS_ISALog;

# Parse the header
my $line;
while (defined( $line = <> )) {
    last unless $line =~ /^#/;
    $parser->parse_directive( $line );
}

lr_err( "invalid W3C extended log file: must start by Version and Fields " .
    "directives" ) unless defined $parser->{fields} &&
    defined $parser->{version};

my $todlf = $parser->{proxy_dlf_converter};

# Transform into DLF
do {
    $lines++;

    if ( $line =~ /^#/ ) {
        eval {
            $parser->parse_directive( $line );
        };
        if ( $@ ) {
            lr_err( $@ );
            $errorlines++;
            last;
        }
    } else {
        eval {
            my $dlf = $todlf->( $line );
            print join( " ", @$dlf), "\n";
            $dlflines++;
        };
        if ($@) {
            lr_warn( $@ );
            lr_notice( qq{cannot convert line $. "$line" to proxy dlf, skipping} );
        }
    }
    $line = <>;
} while (defined $line);

end_dlf_converter( $lines, $dlflines, $errorlines );

exit 0;


__END__

=pod

=head1 NAME

ms_isa2dlf - convert Microsoft ISA server logs to DLF

=head1 SYNOPSIS

B<ms_isa2dlf> B<[>I<file>B<]>

=head1 DESCRIPTION

B<ms_isa2dlf> converts Microsoft Internet Security and Acceleration Server log
files in the W3C Extended Log Format to the proxy DLF.  The ISA log files are
documented on the section on "Firewall and Web Proxy log fields" in the
document "Microsoft Internet Security and Acceleration Server Enterprise
Edition" at http://technet.microsoft.com/en-us/library/cc723430.aspx.

=head1 DEBUGGING

As any Lire 2dlf program, this program needs adjusted LR_DBDIR, LR_DBFILE,
LR_ID and PATH variables.  These are set in .../etc/lire/defaults and
.../etc/lire/profile_lean.  After manually source-ing these files, one can run
this program as a standalone application, by invoking it as e.g.

 zcat ms_isa.log.gz | LR_ID=`date +%Y%m%d.%H%M%S` ./ms_isa2dlf > /tmp/dlf

.

=head1 EXAMPLES

To process a log as produced by the Microsoft ISA Server:

 $ ms_isa2dlf < ms_isa.log

ms_isa2dlf will be rarely used on its own, but is more likely
called by lr_log2report:

 $ lr_log2report ms_isa < /var/log/ms_isa.log

=head1 THANKS

Chainsaw on OPN irc, for supplying log files.

=head1 SEE ALSO

w3c_extended2dlf(1)

=head1 VERSION

$Id: ms_isa2dlf.in,v 1.16 2008/11/19 12:16:05 vanbaal Exp $

=head1 COPYRIGHT

Copyright (C) 2001 Stichting LogReport Foundation LogReport@LogReport.org

This program is part of Lire.

Lire is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program (see COPYING); if not, check with
http://www.gnu.org/copyleft/gpl.html.

=head1 AUTHOR

Joost van Baal <joostvb@logreport.org>, heavily inspired by Francis J.
Lacoste's w3c_extended2dlf(1)

=cut

# Local Variables:
# mode: cperl
# End:



