#! /usr/bin/perl
#
# gatherstats.pl
#
# This script will gather statistical information from a database
# containing headers and other information from a INN feed.
#
# It is part of the NewsStats package.
#
# Copyright (c) 2010-2013, 2025 Thomas Hochstein <thh@thh.name>
#
# It can be redistributed and/or modified under the same terms under
# which Perl itself is published.

BEGIN {
  use File::Basename;
  # we're in .../bin, so our module is in ../lib
  push(@INC, dirname($0).'/../lib');
}
use strict;
use warnings;

use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ParseHierarchies ReadGroupList ParseHeaders);

use DBI;
use Encode qw(decode encode);
use Getopt::Long qw(GetOptions);
Getopt::Long::config ('bundling');

################################# Definitions ##################################

# define types of information that can be gathered
# all / groups (/ clients / hosts)
my %LegalStats;
@LegalStats{('all','groups','hosts','clients')} = ();

################################# Main program #################################

### read commandline options
my ($OptCheckgroupsFile,$OptClientsDB,$OptDebug,$OptGroupsDB,$OptTLH,
    $OptHostsDB,$OptMID,$OptMonth,$OptRawDB,$OptStatsType,$OptTest,
    $OptConfFile);
GetOptions ('c|checkgroups=s' => \$OptCheckgroupsFile,
            'clientsdb=s'     => \$OptClientsDB,
            'd|debug+'        => \$OptDebug,
            'groupsdb=s'      => \$OptGroupsDB,
            'hierarchy=s'     => \$OptTLH,
            'hostsdb=s'       => \$OptHostsDB,
            'mid=s'           => \$OptMID,
            'm|month=s'       => \$OptMonth,
            'rawdb=s'         => \$OptRawDB,
            's|stats=s'       => \$OptStatsType,
            't|test!'         => \$OptTest,
            'conffile=s'      => \$OptConfFile,
            'h|help'          => \&ShowPOD,
            'V|version'       => \&ShowVersion) or exit 1;

### read configuration
my %Conf = %{ReadConfig($OptConfFile)};

### override configuration via commandline options
my %ConfOverride;
$ConfOverride{'DBTableRaw'}   = $OptRawDB if $OptRawDB;
$ConfOverride{'DBTableGrps'}  = $OptGroupsDB if $OptGroupsDB;
$ConfOverride{'DBTableHosts'} = $OptHostsDB if $OptHostsDB;
$ConfOverride{'DBTableClnts'} = $OptClientsDB if $OptClientsDB;
$ConfOverride{'TLH'} = $OptTLH if $OptTLH;
&OverrideConfig(\%Conf,\%ConfOverride);

# set --debug and --test if --mid is set
if ($OptMID) {
  $OptDebug = 1; $OptTest = 1;
}

### get type of information to gather, defaulting to 'all'
$OptStatsType = 'all' if !$OptStatsType;
&Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType))
  if !exists($LegalStats{$OptStatsType});

### get time period from --month
# get verbal description of time period, drop SQL code
my ($Period) = &GetTimePeriod($OptMonth);
# bail out if --month is invalid or set to 'ALL';
# we don't support the latter
&Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ".
         "'YYYY-MM:YYYY-MM'!") if (!$Period or $Period eq 'all time');

### init database
my $DBHandle   = InitDB(\%Conf,1);
my $DBRaw      = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableRaw'});
my $DBGrps     = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableGrps'});
my $DBHosts    = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableHosts'});
my $DBClients  = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableClnts'});

### get data for each month
&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
foreach my $Month (&ListMonth($Period)) {

  print "---------- $Month ----------\n" if $OptDebug;

  ### GroupStats
  if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
    ### reformat $Conf{'TLH'}
    my $TLH;
    if ($Conf{'TLH'}) {
      # $Conf{'TLH'} is parsed as an array by Config::Auto;
      # make a flat list again, separated by :
      if (ref($Conf{'TLH'}) eq 'ARRAY') {
        $TLH = join(':',@{$Conf{'TLH'}});
      } else {
        $TLH  = $Conf{'TLH'};
      }
      # strip whitespace
      $TLH =~ s/\s//g;
      # add trailing dots if none are present yet
      # (using negative look-behind assertions)
      $TLH =~ s/(?<!\.):/.:/g;
      $TLH =~ s/(?<!\.)$/./;
      # check for illegal characters
      &Bleat(2,'Config error - illegal characters in TLH definition!')
        if ($TLH !~ /^[a-zA-Z0-9:+.-]+$/);
      # escape dots
      $TLH =~ s/\./\\./g;
      if ($TLH =~ /:/) {
        # reformat $TLH from a:b to (a)|(b),
        # e.g. replace ':' by ')|('
        $TLH =~ s/:/)|(/g;
        $TLH = '(' . $TLH . ')';
      };
    };
    &GroupStats($DBHandle,$DBRaw,$DBGrps,$Month,$TLH,$OptCheckgroupsFile,$OptMID,$OptTest,$OptDebug);
  };

  ### HostStats
  if ($OptStatsType eq 'all' or $OptStatsType eq 'hosts') {
    # define known hosts using subdomains
    my @KnownHosts = qw(abavia.com aioe.org arcor-online.net arcor-ip.de astraweb.com read.cnntp.org
                        easynews.com eternal-september.org euro.net fernuni-hagen.de free.fr newsread.freenet.ag
                        googlegroups.com heirich.name news.neostrada.pl netcologne.de newsdawg.com newscene.com
                        news-service.com octanews.com readnews.com wieslauf.sub.de highway.telekom.at
                        united-newsserver.de xennanews.com xlned.com xsnews.nl news.xs4all.nl);
    &HostStats($DBHandle,$DBRaw,$DBHosts,$Month,$OptTLH,$OptMID,$OptTest,$OptDebug,@KnownHosts);
  };

  ### ClientStats
  if ($OptStatsType eq 'all' or $OptStatsType eq 'clients') {
    # define agents/clients that shouldn't be counted
    my @DropAgents = qw(debian fedora firefox gecko gentoo lightning mandriva mnenhy mozilla
                        pclinuxos perl php presto suse suse/opensuse thunderbrowse ubuntu version);
    push(@DropAgents, 'red hat');
    &ClientStats($DBHandle,$DBRaw,$DBClients,$Month,$OptTLH,$OptMID,$OptTest,$OptDebug,@DropAgents);
  };
};

### close handles
$DBHandle->disconnect;

################################# Subroutines ##################################

sub GroupStats {
### ----------------------------------------------------------------------------
### collect number of postings per group
### IN : $DBHandle         : database handle
###      $DBRaw            : database table for raw data (to read from)
###      $DBGrps           : database table for groups data (to write to)
###      $Month            : current month to do
###      $TLH              : TLHs to collect
###      $Checkgroupsfile  : filename template for checkgroups file
###                          (expanded to $Checkgroupsfile-$Month)
###      $MID              : specific Message-ID to fetch (testing purposes)
###      $Test             : test mode
###      $Debug            : debug mode
### OUT: (nothing)
  my ($DBHandle,$DBRaw,$DBGrps,$Month,$TLH,$CheckgroupsFile,$MID,$Test,$Debug) = @_;

  # read list of newsgroups from --checkgroups
  # into a hash
  my %ValidGroups = %{ReadGroupList(sprintf('%s-%s',$CheckgroupsFile,$Month))}
    if $CheckgroupsFile;

  my $DBQuery;
  if (!$MID) {
    ### ----------------------------------------------
    ### get groups data (number of postings per group)
    # get groups data from raw table for given month
    $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s ".
                                             "WHERE day LIKE ? AND NOT disregard",
                                             $DBRaw));
    $DBQuery->execute($Month.'-%')
      or &Bleat(2,sprintf("Can't get groups data for %s from %s: ".
                          "$DBI::errstr\n",$Month,
                          $DBRaw));
  } else {
    $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s ".
                                             "WHERE mid = ?", $DBRaw));
    $DBQuery->execute($MID)
      or &Bleat(2,sprintf("Can't get groups data for %s from %s: ".
                          "$DBI::errstr\n",$MID,
                          $DBRaw));
  }

  # count postings per group
  my %Postings;
  while (($_) = $DBQuery->fetchrow_array) {
    # get list of newsgroups and hierarchies from Newsgroups:
    my %Newsgroups = ListNewsgroups($_,$TLH,
                                    $CheckgroupsFile ? \%ValidGroups : '');
    # count each newsgroup and hierarchy once
    foreach (sort keys %Newsgroups) {
      $Postings{$_}++;
    };
  };

  # add valid but empty groups if --checkgroups is set
  if (%ValidGroups) {
    foreach (sort keys %ValidGroups) {
      if (!defined($Postings{$_})) {
        # add current newsgroup as empty group
        $Postings{$_} = 0;
        warn (sprintf("ADDED: %s as empty group\n",$_));
        # add empty hierarchies for current newsgroup as needed
        foreach (ParseHierarchies($_)) {
          my $Hierarchy = $_ . '.ALL';
          if (!defined($Postings{$Hierarchy})) {
            $Postings{$Hierarchy} = 0;
            warn (sprintf("ADDED: %s as empty group\n",$Hierarchy));
          };
        };
      }
    };
  };

  # delete old data for that month
  if (!$Test) {
    $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s WHERE month = ?",
                                     $DBGrps), undef,$Month)
      or &Bleat(2,sprintf("Can't delete old groups data for %s from %s: ".
                          "$DBI::errstr\n",$Month,$DBGrps));
  };

  print "----- GroupStats -----\n" if $Debug;
  foreach my $Newsgroup (sort keys %Postings) {
    print "$Newsgroup => $Postings{$Newsgroup}\n" if $Debug;
    if (!$Test) {
      # write to database
      $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s ".
                                            "(month,newsgroup,postings) ".
                                            "VALUES (?, ?, ?)",$DBGrps));
      $DBQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup})
        or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s: ".
                            "$DBI::errstr\n",$Month,$Newsgroup,$DBGrps));
      $DBQuery->finish;
    };
  };
};
### ----------------------------------------------------------------------------

sub HostStats {
### ----------------------------------------------------------------------------
### collect number of postings per server
### IN : $DBHandle         : database handle
###      $DBRaw            : database table for raw data (to read from)
###      $DBHosts          : database table for hosts data (to write to)
###      $Month            : current month to do
###      $TLH              : TLHs to collect
###      $MID              : specific Message-ID to fetch (testing purposes)
###      $Test             : test mode
###      $Debug            : debug mode
###      @KnownHosts       : list of known hosts with subdomains
### OUT: (nothing)
  my ($DBHandle,$DBRaw,$DBHosts,$Month,$TLH,$MID,$Test,$Debug,@KnownHosts) = @_;

  my (%Postings,$DBQuery);

  $DBQuery = GetHeaders($DBHandle,$DBRaw,$Month,$MID);

  ### ----------------------------------------------
  print "----- HostStats -----\n" if $Debug;
  ### parse headers
  while (my ($Newsgroups,$Headers) = $DBQuery->fetchrow_array) {
    ### skip postings with wrong TLH
    next if ($TLH && !CheckTLH($Newsgroups,$TLH));

    my $Host;
    my %Header = ParseHeaders(split(/\n/,$Headers));

    # ([a-z0-9-_]+\.[a-z0-9-_.]+) tries to match a hostname
    # Injection-Info
    if($Header{'injection-info'}) {
      ($Host) = $Header{'injection-info'} =~ /^\s*([a-z0-9-_]+\.[a-z0-9-_.]+);/i;
      # reset if IP address
      undef($Host) if $Host && $Host !~ /[g-z]/i;
    }
    # X-Trace
    if (!$Host && $Header{'x-trace'}) {
      (undef, $Host) = $Header{'x-trace'} =~ /^(\s|\d)*([a-z0-9-_]+\.[a-z0-9-_.]+)/i;
      # reset if IP address
      undef($Host) if $Host && $Host !~ /[g-z]/i;
    }
    # Path
    if (!$Host) {
      if ($Header{'path'} =~ /!([^!]+)!.POSTED!/) {
        $Host = "$1";
      } elsif ($Header{'path'} =~ /([^!]+)!.POSTED.[^!]+!?/) {
        $Host = "$1";
      } else {
        # iterate on the Path: header until we have a host name or no more
        # path elements
        while (!$Host && $Header{'path'} =~ /!/) {
          ($Host) = $Header{'path'} =~ /!?([a-z0-9-_]+\.[a-z0-9-_.]+)!!?[^!]+!?$/i;
          undef($Host) if $Host && $Host =~ /\.MISMATCH/;
          # remove last path element
          $Header{'path'} =~ s/!!?[^!]+$//;
        };
      }
    }

    # trailing .POSTED
    ($Host) = $Host =~ /(\S+)\.POSTED$/ if $Host =~ /\.POSTED$/;

    # special cases
    $Host = 'news.highwinds-media.com' if $Host =~ /f(e|x)\d\d\.\S{3}\d?$/
                                       or $Host =~ /(newsfe|fed)\d+\.(iad|ams2)$/;
    $Host = 'newshosting.com' if $Host =~ /post\d*\.iad$/;
    $Host = 'eternal-september.org' if $Host =~ /dont-email\.me$/;

    # normalize hosts
    foreach (@KnownHosts) {
      if ($Host =~ /\.$_$/) {
        ($Host) = $_ ;
        last;
      }
    }

    # count host
    if ($Host) {
      $Host = lc($Host);
      $Postings{$Host}++;
      $Postings{'ALL'}++;
    } else {
      &Bleat(1,sprintf("%s FAILED", $Header{'message-id'})) if !$Host;
    }

    printf("%s: %s\n", $Header{'message-id'}, $Host) if ($MID or $Debug && $Debug >1);
  };

  # delete old data for that month
  if (!$Test) {
    $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s WHERE month = ?",
                                     $DBHosts),undef,$Month)
      or &Bleat(2,sprintf("Can't delete old hosts data for %s from %s: ".
                          "$DBI::errstr\n",$Month,$DBHosts));
  };

  foreach my $Host (sort keys %Postings) {
    print "$Host => $Postings{$Host}\n" if $Debug;
    if (!$Test) {
      # write to database
      $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s ".
                                            "(month,host,postings) ".
                                            "VALUES (?, ?, ?)",$DBHosts));
      $DBQuery->execute($Month, $Host, $Postings{$Host})
        or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s: ".
                            "$DBI::errstr\n",$Month,$Host,$DBHosts));
      $DBQuery->finish;
    };
  };
};

sub ClientStats {
### ----------------------------------------------------------------------------
### collect number of postings per client (and version)
### IN : $DBHandle         : database handle
###      $DBRaw            : database table for raw data (to read from)
###      $DBClients        : database table for clients data (to write to)
###      $Month            : current month to do
###      $TLH              : TLHs to collect
###      $MID              : specific Message-ID to fetch (testing purposes)
###      $Test             : test mode
###      $Debug            : debug mode
###      @DropAgents       : list of UserAgent "agents" that won't be counted
### OUT: (nothing)
  my ($DBHandle,$DBRaw,$DBClients,$Month,$TLH,$MID,$Test,$Debug,@DropAgents) = @_;

  my (%Postings,$DBQuery);
  my %DropAgent = map { $_ => 1 } @DropAgents;

  $DBQuery = GetHeaders($DBHandle,$DBRaw,$Month,$MID);

  ### ----------------------------------------------
  print "----- ClientStats -----\n" if $Debug;
  ### parse headers
  while (my ($Newsgroups,$Headers) = $DBQuery->fetchrow_array) {
    ### skip postings with wrong TLH
    next if ($TLH && !CheckTLH($Newsgroups,$TLH));

    my (@Clients, $Client, $Version);
    my %Header = ParseHeaders(split(/\n/,$Headers));

    ### X-Mailer
    if ($Header{'x-mailer'}) {
      # transfer to x-newsreader and parse from there
      $Header{'x-newsreader'} = $Header{'x-mailer'};
    }
    ### X-Newsreader
    if ($Header{'x-newsreader'}) {
      $Header{'x-newsreader'} = RemoveComments($Header{'x-newsreader'});
      # remove 'http://' and 'via' (CrossPoint)
      $Header{'x-newsreader'} =~ s/https?:\/\///;
      $Header{'x-newsreader'} =~ s/ ?via(.+)?$//;
      # parse header
      # User-Agent style
      if ($Header{'x-newsreader'} =~ /^([^\/ ]+\/[^\/ ]+ ?)+$/) {
        # transfer to user-agent and parse from there
        $Header{'user-agent'} = $Header{'x-newsreader'};
      # "client name version"
      } elsif ($Header{'x-newsreader'} =~ / /) {
        ($Client, $Version) = ParseXNewsreader($Header{'x-newsreader'});
      } else {
        $Client = $Header{'x-newsreader'};
        $Version = '';
      }
      if ($Client) {
        # special cases
        $Client  = 'CrossPoint'     if $Client =~ /^CrossPoint\//;
        $Client  = 'Virtual Access' if $Client =~ /^Virtual Access/;
        my %UserAgent = (agent   => $Client,
                         version => $Version);
        push @Clients, { %UserAgent };
      } else {
        $Header{'user-agent'} = $Header{'x-newsreader'};
      }
    }
    ### User-Agent
    if(!@Clients && $Header{'user-agent'}) {
      $Header{'user-agent'} = RemoveComments($Header{'user-agent'});
      ### well-formed?
      if ($Header{'user-agent'} =~ /^([^\/ ]+\/[^\/ ]+ ?)+$/) {
        @Clients = ParseUserAgent($Header{'user-agent'});
      } else {
        # snip and add known well-formed agents from the trailing end
        while ($Header{'user-agent'} =~ /(((Hamster)|(Hamster-Pg)|(KorrNews)|(OE-Tools)|(Mime-proxy))(\/[^\/ ]+))$/) {
          push @Clients, ParseUserAgent($1);
          $Header{'user-agent'} =~ s/ [^\/ ]+\/[^\/ ]+$//;
        }
        ### special cases
        # remove 'http://open-news-network.org'
        $Header{'user-agent'} =~ s/^https?:\/\/open-news-network.org(\S+)?//;
        # Thunderbird
        if ($Header{'user-agent'} =~ /((Mozilla[- ])?Thunderbird) ?([0-9.]+)?/) {
          $Client  = 'Thunderbird';
          $Version = $3;
        # XP
        } elsif ($Header{'user-agent'} =~ /((TrueXP|FreeXP|XP2(\/Agent)?)) \/(.+)$/) {
          $Client  = $1;
          $Version = $4;
          $Client  = 'XP2' if $Client eq 'XP2/Agent';
        ### most general case
        # client version
        # client/version
        # client/32 version
        # - version may end in one non-numeric character
        # - including trailing beta/pre/...
        # 1) client:   (([^0-9]+)|(\D+\/\d+))
        # 2) version:  (\S+\d\D?)
        # 3) trailing: (( alpha\d?)|( beta\d?)|( rc\d)| pre| trialware)?
        } elsif ($Header{'user-agent'} =~ /^(([^0-9]+)|(\D+\/\d+))[\/ ]((\S+\d\D?)(( alpha\d?)|( beta\d?)|( rc\d)| pre| trialware)?)$/) {
          $Client  = $1;
          $Version = $4;
        ### some very special cases
        # SeaMonkey/nn
        } elsif ($Header{'user-agent'} =~ /SeaMonkey\/([0-9.]+)/) {
          $Client  = 'Seamonkey';
          $Version = $1;
        # Emacs nn/Gnus nn
        } elsif ($Header{'user-agent'} =~ /Emacs [0-9.]+\/Gnus ([0-9.]+)/) {
          $Client  = 'Gnus';
          $Version = $1;
        # failed to parse
        } else {
          $Client = $Header{'user-agent'};
        }
        # count client, if found
        if ($Client) {
          my %UserAgent = (agent   => $Client,
                           version => $Version);
          push @Clients, { %UserAgent };
        } else {
          &Bleat(1,sprintf("%s FAILED", $Header{'message-id'})) if !@Clients;
        }
      }
    }

    if (@Clients) {
      $Postings{'ALL'}{'ALL'}++;
      foreach (@Clients) {
        # filter agents for User-Agent with multiple agents
        next if $#Clients && exists($DropAgent{lc($_->{'agent'})});
        # remove whitespace
        $_->{'agent'}   =~ s/^\s+|\s+$//g;
        $_->{'version'} =~ s/^\s+|\s+$//g if $_->{'version'};
        # encode to utf-8, if necessary
        $_->{'agent'}   = encode('UTF-8', $_->{'agent'})   if $_->{'agent'} =~ /[\x80-\x{ffff}]/;
        $_->{'version'} = encode('UTF-8', $_->{'version'}) if $_->{'version'} and $_->{'version'} =~ /[\x80-\x{ffff}]/;
        # truncate overlong clients or versions
        $_->{'agent'}   = substr($_->{'agent'}, 0, 150)  if length($_->{'agent'}) > 150;
        $_->{'version'} = substr($_->{'version'}, 0, 50) if $_->{'version'} and length($_->{'version'}) > 50;
        # special cases
        # Mozilla
        $_->{'agent'} = 'Mozilla' if $_->{'agent'} eq '•Mozilla';
        $_->{'agent'} =~ s/^Mozilla //;
        # Forte Agent
        $_->{'agent'} = 'Forte Agent' if $_->{'agent'} eq 'ForteAgent';
        if ($_->{'agent'} eq 'Forte Agent') {
          $_->{'version'} =~ s/-/\//;
          $_->{'version'} = '' if $_->{'version'} eq '32Bit';
        }
        # count client ('ALL') and client/version (if version is present)
        $Postings{$_->{'agent'}}{'ALL'}++;
        $Postings{$_->{'agent'}}{$_->{'version'}}++ if $_->{'version'};

        printf("%s: %s {%s}\n", $Header{'message-id'}, $_->{'agent'},
                                $_->{'version'} ? $Postings{$_->{'agent'}}{$_->{'version'}} : '')
                                if ($MID or $Debug && $Debug >1);
      }
    }
  };

  # delete old data for that month
  if (!$Test) {
    $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s WHERE month = ?",
                                     $DBClients),undef,$Month)
      or &Bleat(2,sprintf("Can't delete old client data for %s from %s: ".
                          "$DBI::errstr\n",$Month,$DBClients));
  };

  foreach my $Client (sort keys %Postings) {
    foreach my $Version (sort keys %{$Postings{$Client}}) {
      printf ("%s {%s}: %d\n",$Client,$Version,$Postings{$Client}{$Version}) if $Debug;

      if (!$Test) {
        # write to database
        $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s ".
                                              "(month,client,version,postings) ".
                                              "VALUES (?, ?, ?, ?)",$DBClients));
        $DBQuery->execute($Month, $Client, $Version, $Postings{$Client}{$Version})
          or &Bleat(2,sprintf("Can't write groups data for %s/%s/%s to %s: ".
                              "$DBI::errstr\n",$Month,$Client,$Version,$DBClients));
        $DBQuery->finish;
      };
    }
  };

};

sub GetHeaders {
### ----------------------------------------------------------------------------
### get (newsgroups and) raw headers from database
### IN : $DBHandle: database handle
###      $DBRaw   : database table for raw data (to read from)
###      $Month   : current month to do
###      $MID     : specific Message-ID to fetch (testing purposes)
### OUT: DBI statement handle
  my ($DBHandle,$DBRaw,$Month,$MID) = @_;

  my $DBQuery;

  if (!$MID) {
    # get raw header data from raw table for given month
    $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups,headers FROM %s ".
                                             "WHERE day LIKE ? AND NOT disregard",
                                             $DBRaw));
    $DBQuery->execute($Month.'-%')
      or &Bleat(2,sprintf("Can't get header data for %s from %s: ".
                          "$DBI::errstr\n",$Month,$DBRaw));
  } else {
    $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups,headers FROM %s ".
                                             "WHERE mid = ?", $DBRaw));
    $DBQuery->execute($MID)
      or &Bleat(2,sprintf("Can't get header data for %s from %s: ".
                          "$DBI::errstr\n",$MID,$DBRaw));
  }
  return $DBQuery;
}

sub CheckTLH {
### ----------------------------------------------------------------------------
### count newsgroups from legal TLH(s)
### IN : $Newsgroups: comma separated list of newsgroups
###      $TLH       : (reference to an array of) legal TLH(s)
### OUT: number of newsgroups from legal TLH(s)
  my ($Newsgroups,$TLH) = @_;

  my (@TLH,$GroupCount);

  # fill @TLH from $TLH, which can be an array reference or a scalar value
  if (ref($TLH) eq 'ARRAY') {
    @TLH = @{$TLH};
  } else {
    push @TLH, $TLH;
  }

  # remove whitespace from contents of Newsgroups:
  chomp($Newsgroups);
  $Newsgroups =~ s/\s//;
  for (split /,/, $Newsgroups) {
    my $Newsgroup = $_;
    foreach (@TLH) {
      # increment $GroupCount if $Newsgroup starts with $TLH
      $GroupCount++ if $Newsgroup =~ /^$_/;
    }
  };

  return $GroupCount;
}

sub RemoveComments {
### ----------------------------------------------------------------------------
### remove comments and other junk from header
### IN : $Header: a header
### OUT: the header, with comments and other junk removed
  my $Header = shift;

  # decode MIME encoded words
  if ($Header =~ /=\?\S+\?[BQ]\?/) {
    $Header = decode("MIME-Header",$Header);
  }
  
  # remove nested comments from '(' to first ')'
  while ($Header =~ /\([^)]+\)/) {
    $Header =~ s/\([^()]+?\)//;
  }

  # remove dangling ')'
  $Header =~ s/\S+\)//;

  # remove from dangling '(' to end of header
  $Header =~ s/\(.+$//;

  # remove from '[' to first ']'
  $Header =~ s/\[[^\[\]]+?\]//;

  # remove 'Nr. ... lebt'
  $Header =~ s/Nr\. \d+ lebt//;

  # remove nn:nn:nn
  $Header =~ s/\d\d:\d\d:\d\d//;

  # remove 'mm/... '
  $Header =~ s/\/mm\/\S+//;

  # remove ' DE' / _DE'
  $Header =~ s/[ _]DE//;

  # remove trailing 'eol' or '-shl'
  # or ml-inews[-sig]
  $Header =~ s/(eol)|(-shl)|(ml-inews(-sig)?)$//;

  # remove from ';' or ',' (CrossPoint)
  # or '&' to end of header
  $Header =~ s/[;,&].+$//;

  # remove from 'by ' or 'unter Windows' or '@ Windows'
  # to end of header
  $Header =~ s/((by )|(unter +Windows)|(@ Windows)).+$//;

  # remove superfluous whitespace in header
  # and whitespace around header
  $Header =~ s/\s+/ /g;
  $Header =~ s/^\s+|\s+$//g;

  return $Header;
}

sub ParseXNewsreader {
### ----------------------------------------------------------------------------
### parse X-Newsreader header (client and version, if present)
### IN : $XNR: a X-Newsreader header
### OUT: client and version, if present
  my $XNR = shift;

  my ($Client, $Version);

  foreach (split(/ /,$XNR)) {
    # add to client name if no digit present
    if (!/\d[0-9.]/ or /\/\d$/) {
      $Client .= $_ . ' ' ;
    # otherwise, use as version and terminate parsing
    } else {
      $Version = $_;
      last;
    }
  }

  # remove trailing whitespace
  $Client =~ s/\s+$// if $Client;

  # set $Version
  $Version = '' if !$Version;
   
  return $Client, $Version;
}


sub ParseUserAgent {
### ----------------------------------------------------------------------------
### parse User-Agent header (agent and version)
### IN : $UserAgent: a User-Agent header
### OUT: array of hashes (agent/version)
  my $UserAgent = shift;

  my @UserAgents;

  # a well-formed User-Agent header will contain pairs of
  # client/version, i.e. 'slrn/0.9.7.3'
  foreach (split(/ /,$UserAgent)) {
    my %UserAgent;
    /^(.+)\/(.+)$/;
    $UserAgent{'agent'}   = $1;
    $UserAgent{'version'} = $2;
    push @UserAgents, { %UserAgent };
  }
  
  return @UserAgents;
}

__END__

################################ Documentation #################################

=head1 NAME

gatherstats - process statistical data from a raw source

=head1 SYNOPSIS

B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats>] [B<-c> I<filename template>]] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<-groupsdb> I<database table>] [B<--hostsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--conffile> I<filename>]

=head1 REQUIREMENTS

See L<doc/README>.

=head1 DESCRIPTION

This script will extract and process statistical information from a
database table which is fed from F<feedlog.pl> for a given time period
and write its results to (an)other database table(s). Entries marked
with I<'disregard'> in the database will be ignored; currently, you
have to set this flag yourself, using your database management tools.
You can exclude erroneous entries that way (e.g. automatic reposts
(think of cancels flood and resurrectors); spam; ...).

The time period to act on defaults to last month; you can assign
another time period or a single month via the B<--month> option (see
below).

By default B<gatherstats> will process all types of information; you
can change that using the B<--stats> option and assigning the type of
information to process.

Possible information types include:

=over 3

=item B<groups> (postings per group per month)

B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
counted for each single group they appear in. Groups not in I<TLH>
will be ignored.

B<gatherstats> will also add up the number of postings for each
hierarchy level, but only count each posting once. A posting to
de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
respectively. A crossposting to de.alt.test and de.alt.admin, on the
other hand, will be counted for de.alt.test and de.alt.admin each, but
only once for de.alt.ALL and de.ALL.

Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can
override that default through the B<--groupsdb> option.

=item B<hosts> (postings from host per month)

B<gatherstats> will examine Injection-Info:, X-Trace: and Path:
headers and try to normalize them. The sum of all detected hosts will
also be saved for each month. Groups not in I<TLH> will be ignored.

Data is written to I<DBTableHosts> (see L<doc/INSTALL>); you can
override that default through the B<--hostsdb> option.

=item B<clients> (postings by client per month)

B<gatherstats> will examine User-Agent:, X-Newsreader: and X-Mailer:
headers and try to remove comments and non-standard contents. Clients
and client versions are counted separately. The sum of all detected
clients will also be saved for each month. Groups not in I<TLH> will
be ignored. 

Data is written to I<DBTableClnts> (see L<doc/INSTALL>); you can
override that default through the B<--clientsdb> option.

=back

=head2 Configuration

B<gatherstats> will read its configuration from F<newsstats.conf>
which should be present in etc/ via Config::Auto or from a configuration
file submitted by the B<--conffile> option.

See L<doc/INSTALL> for an overview of possible configuration options.

You can override configuration options by using the B<--hierarchy>,
B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options,
respectively.

=head1 OPTIONS

=over 3

=item B<-V>, B<--version>

Display version and copyright information and exit.

=item B<-h>, B<--help>

Display this man page and exit.

=item B<-d>, B<--debug>

Print debugging information to STDOUT while processing (number of
postings per group).

=item B<-t>, B<--test>

Do not write results to database. You should use B<--debug> in
conjunction with B<--test> ... everything else seems a bit pointless.

=item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]>

Set processing period to a single month in YYYY-MM format or to a time
period between two month in YYYY-MM:YYYY-MM format (two month, separated
by a colon). Defaults to last month.

=item B<-s>, B<--stats> I<type>

Set processing type to one of I<all>, I<groups>, I<hosts> or I<clients>.
Defaults to I<all>.

=item B<-c>, B<--checkgroups> I<filename template>

Relevant only for newsgroup stats (I<groups>).

Check each group against a list of valid newsgroups read from a file,
one group on each line and ignoring everything after the first
whitespace (so you can use a file in checkgroups format or (part of)
your INN active file).

The filename is taken from I<filename template>, amended by each
B<--month> B<gatherstats> is processing in the form of I<template-YYYY-MM>,
so that

    gatherstats -m 2010-01:2010-12 -c checkgroups

will check against F<checkgroups-2010-01> for January 2010, against
F<checkgroups-2010-02> for February 2010 and so on.

Newsgroups not found in the checkgroups file will be dropped (and
logged to STDERR), and newsgroups found there but having no postings
will be added with a count of 0 (and logged to STDERR).

=item B<--hierarchy> I<TLH> (newsgroup hierarchy/hierarchies)

Override I<TLH> from F<newsstats.conf>.

I<TLH> can be a single word or a comma-separated list.

=item B<--rawdb> I<table> (raw data table)

Override I<DBTableRaw> from F<newsstats.conf>.

=item B<--groupsdb> I<table> (postings per group table)

Override I<DBTableGrps> from F<newsstats.conf>.

=item B<--hostsdb> I<table> (host data table)

Override I<DBTableHosts> from F<newsstats.conf>.

=item B<--clientsdb> I<table> (client data table)

Override I<DBTableClnts> from F<newsstats.conf>.

=item B<--conffile> I<filename>

Read configuration from I<filename> instead of F<newsstats.conf>.

=back

=head1 INSTALLATION

See L<doc/INSTALL>.

=head1 EXAMPLES

Process all types of information for lasth month:

    gatherstats

Do a dry run, showing results of processing:

    gatherstats --debug --test

Process all types of information for January of 2010:

    gatherstats --month 2010-01

Process only number of postings for the year of 2010,
checking against checkgroups-*:

    gatherstats -m 2010-01:2010-12 -s groups -c checkgroups

=head1 FILES

=over 4

=item F<bin/gatherstats.pl>

The script itself.

=item F<lib/NewsStats.pm>

Library functions for the NewsStats package.

=item F<etc/newsstats.conf>

Runtime configuration file.

=back

=head1 BUGS

Please report any bugs or feature requests to the author or use the
bug tracker at L<https://code.virtcomm.de/thh/newsstats/issues>!

=head1 SEE ALSO

=over 2

=item -

L<doc/README>

=item -

L<doc/INSTALL>

=back

This script is part of the B<NewsStats> package.

=head1 AUTHOR

Thomas Hochstein <thh@thh.name>

=head1 COPYRIGHT AND LICENSE

Copyright (c) 2010-2013, 2025 Thomas Hochstein <thh@thh.name>

This program is free software; you may redistribute it and/or modify it
under the same terms as Perl itself.

=cut
