#!/usr/bin/perl
#
use strict;
use warnings;

use Time::Local qw( timelocal );
use POSIX qw( strftime );

my $DEBUG = 0;

my $MIN_WGET_GAP = 3;    # In seconds

my $conf = "myrt.conf";
open CONF, $conf or die "open $conf: \n";

# Add YOUR mail address HERE - it won't run until you do!
# my $user_agent = "my.name\@my.mail.domain auto-fetcher";
my $last_wget_start;

my %lcn_to_name = ();

sub open_rt {
    my ($lcn, $rtcn, $name) = @_;
    my $uri = "http://xmltv.radiotimes.com/xmltv/$rtcn.dat";
    print STDERR "Getting $name($rtcn) for channel $lcn\n" if $DEBUG;
    my $rth;
    open $rth, "-|", "wget", "-T", "100",
        "--header=User-Agent: $user_agent", "-O-", "-q", $uri
        or do {
        my $o_err = $!;
        close $rth;
        print STDERR "wget failed: $o_err($!)\n";
        exit 1;
        };
    $last_wget_start = time();
    return $rth;
}

# Global array and function for character mapping.
# The xmltv data contains what looks a bit like the MHEG standard for
# accented characters, but it's actually different.
# (see:  http://www.dtg.org.uk/reference/mheg_profile1_05.pdf sect 5.10)
# This code is here to map the 2-byte sequences which xmltv sends into
# the correct (or relevant) iso8859-9 character (which is what the
# toppy's display uses).
# (for listing see:  http://www.langbox.com/codeset/iso8859-9.html)
#
my %BBC_CMap = (
    '|' => "\xa6",    # Split V-bar (| is our delimiter)

# Jan 2008 - Seemed to switch to utf-8 encoding?
# Does this negate the above ones?  We'll see.
# See end of script for old mappings.
# Not much left here now...
#
);

# This function is called from the regex matching.
# If $from is a string of whitespace we return a single space
# If we have a mapping we return it.
# Otherwise we return a currency character (circle with ticks at ne, se,
# sw, nw) so that it will show up and set off an investigation to
# determine the correct mapping.
#
our $this_line;    # So we can show it on unknown mapping
our @non_utf8_mapping;

# With the switch to utf-8, RT was seen to send this for what looks
# like left and right-(double)quote and apostrophe
#
# Also seen
#    "\xc3\xa2\xc2\x80\xc2\x94"
#  in the context it probably meant something like a long-hyphen
#
# All weird ones now grouped into:
#   code => [ text_descr, map_to ]
#
# Some sense was made of these entries by:
# \xc3\xa2 => \xe2, followed by ignoring the \xc2 bytes
# so "\xc3\xa2\xc2\x80\xc2\x93" => "\xe2\x80\x93" == en-dash
#   It turns out that this is double-enconding - to get what was
# meant you need to double-decode utf8.
#
our %weird_map = (
    "\xc3\xa2\xc2\x80\xc2\x93" => ['"long" en-dash',            "-"],
    "\xc3\xa2\xc2\x80\xc2\x94" => ['"long" em-dash',            "--"],
    "\xc3\xa2\xc2\x80\xc2\x98" => ['"long" left single-quote',  "'"],
    "\xc3\xa2\xc2\x80\xc2\x99" => ['"long" right single-quote', "'"],
    "\xc3\xa2\xc2\x80\xc2\x9c" => ['"long" left double-quote',  '"'],
    "\xc3\xa2\xc2\x80\xc2\x9d" => ['"long" right double-quote', '"'],
    "\xc3\xa2\xc2\x80\xc2\xa6" => ['"long" horizontal elipsis', '...'],
    "\xc3\xa2\xc2\x82\xc2\xac" => ['"long" euro',               '(euro)'],
);
our $wrx = qr/\xc3\xa2\xc2.\xc2./;    # Generic 6-char regex for %weird_map

# In the same vein, we have this.
# "\xc3\xaf\xc2\xbf\xc2\xbd" => "\xef\xbf\xbd" == REPLACEMENT CHARACTER
#  (used to replace an incoming character whose value is unknown or
#    unrepresentable in Unicode)
# In the specific context seen an é (e-acute) was expected, but it is
# quite possible that the intention was an "unknown", so that is used.
# I've now also seen "\xef\xbf\xbd" directly in the stream, and in
# that case it meant "'" (and e-acute, and e-umlaut).
# So map both to \xAC (not sign) and note...
#
our $long_repl_char  = "\xc3\xaf\xc2\xbf\xc2\xbd";
our $short_repl_char = "\xef\xbf\xbd";
our $repl_count      = 0;
our $UNKNOWN         = "\xa9";

our (@unknown, @weird);

sub bytes_to_text {
    my $res;
    foreach my $ch (split("", $_[0])) {
        $res .= sprintf '\x%02x', ord($ch);
    }
    return $res;
}

sub do_map {
    my $from = shift;

    if ($from =~ /^\s/) {    # Special case whitespace matcher
        return (" ");
    }

    if ($from eq $long_repl_char) {
        push @weird, "long_repl_char ($long_repl_char) seen!";
        $repl_count++;
        return $UNKNOWN;
    }
    if ($from eq $short_repl_char) {
        push @weird, "short_repl_char ($short_repl_char) seen!";
        $repl_count++;
        return $UNKNOWN;
    }

# Do the generic utf-8 mapping. The rule here is:
#
#    \xc2\(\x80-\x9f]) => $1 - \x40     (odd - but seen - now removed...)
#    \xc2([\xa0-\xbf]) => $1
#    \xc3([\x80-\xbf]) => $1 + \x40
#
# with these "failings" for iso8859-9 in place of 8859-1
#
#    \xc3\x90   (capital Eth)   => \xc4\x9e   (G with breve)
#    \xc3\x9d   (Y with acute)  => \xc4\xb0   (I with dot above)
#    \xc3\x9e   (capital Thorn) => \xc5\x9e   (S with cedilla)
#    \xc3\xb0   (small eth)     => \xc7\x9f   (g with breve)
#    \xc3\xbd   (y with acute)  => \xc4\xb1   (dotless i)
#    \xc3\xbe   (small thorn)   => \xc5\x9e   (s with cedilla)
#
# They have no iso8859-9 equiv for the Toppy, so just note their presence.
#
# Not sure whether this is needed - uncomment if non-break space shows up
# weirdly...
#
#    if ($from eq "\xc2\xa0") {  # Non-break space => space
#        return " ";
#    }
    if ($from =~ /^\xc2([\xa0-\xbf])$/) {
        return $1;
    }
# None of the following are really defined (they should also map to $1,
# as above.  We just need to note them and seee what was meant
#

    if ($from =~ /^\xc2([\x80-\x9f])$/) {
        my $mapped;
        my ($desc, $tag);
        if (ord($1) == 0x91) {    # Seen as "left-quote"
            $mapped = "'";
            $desc = 'odd';
            $tag = "(?)";
        }
        elsif (ord($1) == 0x85) { # This "next Line" 
# http://forum.toppy.org.uk/forum/viewtopic.php?p=118761#118761
# \x8a is half-font-height
            $mapped = "\x8a\x8a";
            $desc = 'odd';
            $tag = "(?)";
        }
        else {
            $mapped = chr(ord($1));
            $desc = 'weird';
            $tag = "";
        }
        my $ft = bytes_to_text($from);
        push @weird, "$desc mid \\xc2 ($from ($ft) => >>$mapped<<$tag) seen!";
        return $mapped;
    }
    if ($from =~ /^$wrx$/) {
        if (exists $weird_map{$from}) {
            my $ft = bytes_to_text($from);
            push @weird,
          "$weird_map{$from}[0] ($from ($ft) => $weird_map{$from}[1]) seen!";
            return $weird_map{$from}[1];
        }
        (my $in_hex = $from) =~ s/(.)/sprintf("\\x%02x", unpack("C", $1))/eg;
        push @unknown, "$from ($in_hex)";
        return "\xa4";
    }
    if ($from =~ /^\xc3([\x80-\xbf])$/) {
        my $tchar = $1;
        if ($tchar =~ /^\x90|\x9d|\x9e|\xb0|\xbd|\xbe$/) {
            (my $in_hex = $from)
                =~ s/(.)/sprintf("\\x%02x", unpack("C", $1))/eg;
            push @unknown, "Non iso8859-9 char seen ($in_hex)";
        }
        return chr(ord($tchar) + 0x40);
    }

    if (exists $BBC_CMap{$from}) {    # Use mapping if we have one
        my $new_ch = $BBC_CMap{$from};
        my $rpt;
        if (substr($from, 0, 1) eq "&") {
            $rpt = "HTML entity for $new_ch";
        }
        elsif (substr($from, 0, 1) lt "\x80") {
            $rpt = "simple replace with $new_ch";
        }
        else {
            ($rpt = $from) =~ s/(.)/sprintf("\\x%02x", unpack("C", $1))/eg;
            $rpt .= " mapped to $new_ch";
        }
        push @non_utf8_mapping, "$from ($rpt)";
        return $BBC_CMap{$from};
    }

# Generic 3-digit entity (<=255) handled generically
#
    if ($from =~ /^&#(\d{3});$/) {
        my $ent = $1 + 0;
        if ($ent <= 255) {    # Only cater for single bytes
            my $map_ch = chr($ent);
            push @non_utf8_mapping, "$from (generic HTML entity => $map_ch)";
            return $map_ch;
        }
    }

# Hmmm - unknown.  Report it and return currency (spaceman)
#
    (my $in_hex = $from) =~ s/(.)/sprintf("\\x%02x", unpack("C", $1))/eg;
    push @unknown, "$from ($in_hex)";
    return "\xa4";
}

my @days_in = qw( 0 31 28 31 30 31 30 31 31 30 31 30 31 );    # 1-based

my $all_earliest = my $earliest = 2**32 - 1;
my $all_latest   = my $latest   = 0;

# Edit some fields for odd characters.  This is designed to trap things
# we aren't handling (but wish to know about) as well as things we do know
# how to handle.
#
my $patterns = qr/(\s{2,}|              # multi-whitespace
                   \||                  # | (used as field separator)
                   $long_repl_char|     # Oddly unknown
                   $short_repl_char|    # Le oddly unknown
                   $wrx|                # All the weird ones
                   [\xc2-\xc3].|        # extended chars 1 (2-bytes)
                   &\#\d+;|             # HTML entities (not comment!)
                   [\x80-\xff])         # Anything else with top-bit set
                   /x;

my $today = strftime("%Y%m%d", localtime());

my $tot_non_utf8_mapping = my $tot_unknown = my $tot_weird = 0;

my $ignore_ending_before = $^T - (4*3_600);

sub reformat {
    my ($lcn, $line, $ch_start, $ch_end) = @_;
    chomp $line;
    $line =~ s/\x00|\x0d//g;   # Nuls have been seen :-(. Remove any <CR> too.

# We trim fields by adding any whitespace into the split
#
    my ($rt_title,       $rt_sub_title,       $rt_episode,
        $rt_year,        $rt_director,        $rt_cast,
        $rt_premiere,    $rt_film,            $rt_repeat,
        $rt_subtitles,   $rt_widescreen,      $rt_new_series,
        $rt_deaf_signed, $rt_black_and_white, $rt_star_rating,
        $rt_certificate, $rt_genre,           $rt_desc,
        $rt_choice,      $rt_date,            $rt_start,
        $rt_stop,        $rt_duration_mins
    ) = split(/\s*~\s*/, $line);

# Data now starts with a note about personal-use only...
    return unless (defined $rt_date);

    my ($day, $month, $year) = split("/", $rt_date);

# Ignore anything starting before today...
# ...this stops us counting it towards valid days' data reporting.
# Actually - that was a bad idea for when you run this just after
#  midnight - you miss the current programs
# So ignore anything *ending* before ~4 hours before now.
# Which we have to do several lines later
#
#    return if ($year . $month . $day lt $today);
    my ($start_hr, $start_min) = split(":", $rt_start);
    my ($stop_hr,  $stop_min)  = split(":", $rt_stop);

# Get the epoch time, so we can remember the earliest and latest and
# hence get the time-span of data
# Note that the times have no dates, so when we span over midnight
# we'll get a stop time before the start time.  So add 1 day.
#
    my $ep_start
        = timelocal(0, $start_min, $start_hr, $day, $month - 1, $year);
    my $ep_stop = timelocal(0, $stop_min, $stop_hr, $day, $month - 1, $year);

# If we just add another 86_400 here it fails around DST changes
# Use the day/month/year fields, but the given stop hr+min.
# Note that we add 1d + 1h + 1m, so that we *do* advance to the next day
# even when it has 24 hours...
#
    if ($ep_stop < $ep_start) {
        my @newday = localtime($ep_stop + 86_400 + 3_601);
        $ep_stop = timelocal(0, $stop_min, $stop_hr, @newday[3, 4, 5]);
    }

# Only now can we decide when it really ends and choose to ignore
# it as too old.
#
    return if ($ep_stop <= $ignore_ending_before);
    $earliest = $ep_start if ($ep_start < $earliest);
    $latest   = $ep_stop  if ($ep_stop > $latest);

# If a limited channel, work out whether this is truncated
# This code assumes that no programme last for 24 hours or more, since
# it ignores days.
#
    my $clipped = "";
    if (defined $ch_end) {
        my $skip = 0;
        if ($ch_start gt $ch_end) {    # String compares work!

# If stop time < start time then there are effectively 3 periods
# (morning, day-gap, night)
# Handle according to where the start time falls.
#
            if ($rt_start lt $ch_end) {    # Starts in morning service
                if ($rt_stop gt $ch_end) {
                    $clipped = "(Programme end at $rt_stop will be missed) ";
                    ($stop_hr, $stop_min) = split(":", $ch_end);
                }
            }
            elsif ($rt_start lt $ch_start) {    # Starts in day-gap
                if ($rt_stop gt $ch_start) {
                    $clipped
                        = "(Programme start at $rt_start will be missed) ";
                    ($start_hr, $start_min) = split(":", $ch_start);
                }
                else {
                    $skip = 1;                  # Entirely in gap
                }
            }
            else {    # Starts in night service - does it run over midnight
                if (($rt_stop lt $rt_start) and ($rt_stop gt $ch_end)) {
                    $clipped = "(Programme end at $rt_stop will be missed) ";
                    ($stop_hr, $stop_min) = split(":", $ch_start);
                }
            }
        }
        else {        # Single day period
            if (($rt_start lt $ch_start) and ($rt_stop le $ch_start)) {
                $skip = 1;    # Skip this program - all over before start
            }
            elsif (($rt_start ge $ch_end) and ($rt_stop gt $ch_end)) {
                $skip = 1;    # Skip this program - all run after end
            }
            elsif (($rt_start ge $ch_end) and ($rt_stop le $ch_start)) {
                $skip = 1;    # Skip this program - between today end
                              # and tomorrow start
            }
            elsif ($rt_start lt $ch_start) {
                $clipped = "(Programme start at $rt_start will be missed) ";
                ($start_hr, $start_min) = split(":", $ch_start);
            }
            elsif ($rt_stop gt $ch_end) {
                $clipped = "(Programme end at $rt_stop will be missed) ";
                ($stop_hr, $stop_min) = split(":", $ch_end);
            }
        }
        return if ($skip);
    }

# Around DLST changeover times they may include timezone at the start
# of the title in ().
#
    $rt_title =~ s/^\((GMT|UTC\+1|BST|UTC)\)\s*//;

# Tweak episode/sub_title.
# I've removed this code since the only time I've seen a match was for
# a program about 9/11 (where it was obviously part of the actual title).
# Just move it to the sub-title.
#
###    if ($rt_episode) {
###        if ($rt_episode =~ m|^(\d+/\d+)\s*(.*)|) {
###            $rt_episode = $1;
###            $rt_sub_title = $2;
###        }
###        else {
###            $rt_sub_title = $rt_episode;
###            $rt_episode = '';
###        }
###    }
    $rt_sub_title = $rt_episode;
    $rt_episode   = '';

# From EMJB on Toppy Forum
# GML - changed to precede description, so it will show under MS6
#
#   $rt_title .= ' (Rpt)' if ($rt_repeat eq 'true');
    $rt_desc = '(Rpt) '.$rt_desc if ($rt_repeat eq 'true');

    @non_utf8_mapping = @unknown = @weird = ();
    foreach my $tr_field ($rt_title, $rt_desc, $rt_sub_title) {
        next if ($tr_field eq "");
        $tr_field =~ s/$patterns/do_map($1)/eg;
    }

    my $do_line = 0;
    if (@non_utf8_mapping) {
        print STDERR "Non-utf8 mapping done:";
        print STDERR join("\n   ", "", @non_utf8_mapping), "\n";
        $do_line = 1;
        $tot_non_utf8_mapping += @non_utf8_mapping;
    }
    if (@weird) {
        printf STDERR "Weird mapping%s for:", (@weird > 1)? "s": "";
        print STDERR join("\n   ", "", @weird), "\n";
        $do_line = 1;
        $tot_weird += @weird;
    }
    if (@unknown) {
        printf STDERR "Unknown mapping%s for:", (@unknown > 1)? "s": "";
        print STDERR join("\n   ", "", @unknown), "\n";
        $do_line = 1;
        $tot_unknown += @unknown;
    }

# Strip CRs off output - Debian mailx treats this as a non-text file
#
    {
        (my $rpt_line = $this_line) =~ s/\r+$//;
        print STDERR " (in $lcn_to_name{$lcn} - channel $lcn):\n$rpt_line"
            if ($do_line);
    }

# Iff the duration is > 240mins we need to split the programme, so that
# MyStuff will still know about it after 4 hours
#
    for (my $start = $ep_start; $start < $ep_stop; $start += 14_400) {
        my $mei_str = "";
        {
            my ($t_min, $t_hr, $t_day, $t_mon, $t_yr);
            ($t_min, $t_hr, $t_day, $t_mon, $t_yr)
                = (localtime($start))[1, 2, 3, 4, 5];

# Construct the string for the line in the mei file
            $mei_str .= sprintf(
                "%4d%02d%02d%02d%02d",
                $t_yr + 1900,
                $t_mon + 1, $t_day, $t_hr, $t_min
            );

            my $stop = $start + 14_400;
            $stop = $ep_stop if ($stop > $ep_stop);
            ($t_min, $t_hr, $t_day, $t_mon, $t_yr)
                = (localtime($stop))[1, 2, 3, 4, 5];
            $mei_str .= sprintf(
                "|%4d%02d%02d%02d%02d",
                $t_yr + 1900,
                $t_mon + 1, $t_day, $t_hr, $t_min
            );
        }
        $mei_str .= "|$lcn";             # LCN of channel
        $mei_str .= "|$rt_title";        # program title
        $mei_str .= "|$rt_sub_title";    # secondary title, e.g. episode title
        $mei_str
            .= "|".(($rt_repeat eq 'true')? 'TRUE': ''); # if this is a repeat
        $mei_str .= "|"
            ;    # if this is first screening of a new show - not in RT data
        $mei_str .= "|$rt_year";    # production date
        $mei_str .= "|"
            .(($rt_premiere eq 'true')? $rt_genre.' Premiere': '')
            ;                       # if it is a premiere
        $mei_str .= "|";                     # last chance
        $mei_str .= "|$clipped$rt_desc";     # program description
# Outsourcing from BBC meant that Film was no longer a genre - but the flag 
# is set correctly.
#
        $rt_genre = 'Film' if ($rt_film eq 'true');
        $mei_str .= "|$rt_genre";            # program genre
        $mei_str .= "|$rt_duration_mins";    # program length
        $mei_str .= "|".(($rt_widescreen eq 'true')? '16:9': ''); # widescreen
        $mei_str .= "|"
            .(($rt_black_and_white eq 'true')? 'no': '')
            ;    # whether it is in colour
        $mei_str .= "|"
            .(($rt_subtitles eq 'true')? 'teletext': '')
            ;    # whether there are subtitles
        $mei_str .= "|$rt_episode";    # episode number
        $mei_str .= "|"
            .(($rt_episode ne '')? 'xmltv_ns': '');    # episodes start at 1
        $mei_str .= "|".(($rt_certificate ne '')? 'BBFC': ''); # rating system
        $mei_str .= "|$rt_certificate";                        # rating
        $mei_str .= "|"
            .(($rt_star_rating ne '')? $rt_star_rating.'/5': '')
            ;                                                  # star rating
        $mei_str = sprintf("%06d", length($mei_str))."|$mei_str\r\n";
        print $mei_str;
    }
}

# @wget_list is a list of entries containing:
#
#  $lcn, $rtcn, $name, open_rt($lcn, $rtcn, $name), $ch_start, $ch_end,
#      $expect_empty
#
# We push an entry onto it then shift the previous one off, so it should
# only ever have at most 2 entries on it.
#
my @wget_list;
my $current;
while (<CONF>) {
    $_ =~ s/#.*//;
    $_ =~ s/^\s+//;
    $_ =~ s/\s+$//;
    my ($lcn, $rtcn, $name, $ch_start, $ch_end) = split(/\s*,\s*/, $_);
    next unless (defined $name);
    $lcn_to_name{$lcn} = $name;
    my $expect_empty = 0;
    if (substr($rtcn, -1, 1) eq '?') {
        $rtcn = substr($rtcn, 0, -1);
        $expect_empty = 1;
    }
    if ($rtcn < 0) {
        print STDERR "Skipping UNKNOWN $name($rtcn) for channel $lcn\n";
        next;
    }
    unless (@wget_list) {    # Get a current on first pass
        push @wget_list, [
            $lcn, $rtcn, $name, open_rt($lcn, $rtcn, $name),
            $ch_start, $ch_end, $expect_empty
            ];
        next;
    }

# Start processing what we already have
#
    my ($cur_lcn, $cur_rtcn, $cur_name, $cur_wget,
        $cur_st,  $cur_end,  $cur_ee
    ) = @{shift @wget_list};
    $this_line = '';
    $earliest  = 2**32 - 1;
    $latest    = 0;
    while (<$cur_wget>) {
        if (not @wget_list and (time() - $last_wget_start) >= $MIN_WGET_GAP) {
            push @wget_list, [
                $lcn, $rtcn, $name, open_rt($lcn, $rtcn, $name),
                $ch_start, $ch_end, $expect_empty
                ];
        }
        next unless (/\S/);
        next if (/^In accessing this XML feed/);
        $this_line = $_;    # So we can show it on unknown mapping (v.u.)
        reformat($cur_lcn, $_, $cur_st, $cur_end);
    }
    close $cur_wget or die "wget failed: $!\n";
    $all_earliest = $earliest if ($earliest < $all_earliest);
    $all_latest   = $latest   if ($latest > $all_latest);
    $earliest = 0 if ($latest == 0);    # == Nothing found....
    {
        my $days_data = ($latest - $earliest)/86_400;
        my $data_warn = ($days_data < 12)? "ONLY ": "";
        printf STDERR "Found %s%0.1f days data for %s(%s) ch %d\n", $data_warn,
            $days_data, $cur_name, $cur_rtcn, $cur_lcn;
    }
    if (not $this_line) {
        if (not $cur_ee) {
            print STDERR
                "RT data $cur_rtcn for $cur_name(ch $cur_lcn) was EMPTY!\n";
        }
        else {
            print STDERR
                "RT data $cur_rtcn for $cur_name(ch $cur_lcn) was empty (expected)\n";
        }
    }
    if ($this_line and $cur_ee) {
        print STDERR
            "RT data $cur_rtcn for $cur_name(ch $cur_lcn) was NOT EMPTY!\n";
    }
    if (not @wget_list) {
        my $wait = ($MIN_WGET_GAP - (time() - $last_wget_start));
        sleep $wait if ($wait > 0);
        push @wget_list, [
            $lcn, $rtcn, $name, open_rt($lcn, $rtcn, $name),
            $ch_start, $ch_end, $expect_empty
            ];
    }
}

# Handle the final one
#
my ($cur_lcn, $cur_rtcn, $cur_name, $cur_wget, $cur_st, $cur_end, $cur_ee)
    = @{shift @wget_list};
$this_line = '';
while (<$cur_wget>) {
    $this_line = $_;    # So we can show it on unknown mapping (v.u.)
    reformat($cur_lcn, $_, $cur_st, $cur_end);
}
if (not $this_line and not $cur_ee) {
    print STDERR "RT data $cur_rtcn for $cur_name($cur_lcn) was EMPTY!\n";
}
if ($this_line and $cur_ee) {
    print STDERR
        "RT data $cur_rtcn for $cur_name(ch $cur_lcn) was NOT EMPTY!\n";
}

# Report on range of data
#
$all_earliest = 0 if ($all_latest == 0);    # == Nothing found....
my $days_data = ($all_latest - $all_earliest)/86_400;
my $data_warn = ($days_data < 12)? "ONLY ": "";
printf STDERR "*** STATS ***\n";
printf STDERR "%s%0.1f days data gathered\n", $data_warn, $days_data;

if ($tot_non_utf8_mapping) {
    print STDERR "$tot_non_utf8_mapping non_utf8_mapping items found\n";
}
if ($tot_unknown) {
    print STDERR "$tot_unknown unknown items found\n";
}
if ($tot_weird) {
    print STDERR "$tot_weird weird items found";
    print STDERR " ($repl_count unrepresentable replacements)"
        if ($repl_count);
    print STDERR "\n";
}

__END__
# Documentation
#
# Since these *may* now have all been replaced with "correct" utf-8
# mapppings I'll comment them out and wait for reports.
#
#    "\xc3\x97" => "\xb0",    # degree sign (see also \xc3\xa6)
#    "\xc3\xa1" => "\xad",    # A (soft) hyphen (also see \xc3\xf9)
#    "\xc3\xa6" => "\xb0",    # Another(?) degree sign
#    "\xc3\xbb" => "\xbd",    # 1/2 (a half)
#    "\xc3\xc6" => "'",       # A quote (Windows intelligent? r quote)
#    "\xc3\xe0" => "...",     # ellipsis
#    "\xc3\xe6" => "'",       # A quote (Windows intelligent? l quote)
#    "\xc3\xf4" => '"',       # Left double quote  (&#8223; 0xE2 0x80 0x9F)
#    "\xc3\xf6" => '"',       # Right double quote (&#8221; 0xE2 0x80 0x9D)
#    "\xc3\xf9" => "\xad",    # A (soft) hyphen
#    "\xc3\xfa" => "\xa3",    # Pound
#    "\xc3\xfb" => "\xad",    # dash?
#    "\xc8\x98" => "\xf1",    # n-tilde
#    "\xc8\xa1" => "\xed",    # i-acute
#    "\xc8\xa4" => "\xde",    # capital Thorn, but that is only in
#                             # iso8859-1. So possibly a mistake.
#                             # A ' was meant in the context seen.
#    "\xc8\xa8" => "\xe9",    # e-acute
#    "\xc8\xa9" => "\xfb",    # u_circumflex
#    "\xc8\xab" => "\xee",    # i-circumflex
#    "\xc8\xac" => "\xea",    # e-circumflex
#    "\xc8\xad" => "\xfc",    # u-umlaut
#    "\xc8\xaf" => "\xef",    # i-umlaut
#    "\xc8\xb5" => "\xf6",    # o-umlaut
#    "\xc8\xb9" => "\xf8",    # o-slash
#    "\xc8\xba" => "\xe7",    # c-cedilla
#    "\xc8\xbd" => "\xeb",    # e-umlaut
#    "\xc8\xbf" => "\xe8",    # e-grave
#    "\xc8\xd1" => "\xe5",    # a-ring
#    "\xc8\xe1" => "\xe0",    # a-grave
#    "\xc8\xe8" => "\xca",    # E-circumflex (albeit wrongly for the example)
#    "\xc8\xeb" => "\xc9",    # E-acute
#    "\xc8\xed" => "\xe1",    # a-acute
#    "\xc8\xf1" => "\xe4",    # a-umlaut
#    "\xc8\xf3" => "\xe2",    # a-circumflex
#    "\xc8\xfe" => "\xf3",    # o-acute

# Not only do top-bit chars get through, but so do html-entities.
# Single byte numeric ones are handled generically, but we handle
# multi-byte ones here, as we need to know what to map them too.
# Some XML-entity encodings mentioned by xmltv
#    "&#8212;"  => "--",      # Long dash
#    "&#8230;"  => "...",     # Elipsis
#    "&#64257;" => "fi",      # Ligature for fi
#
# Description
# Edited from Wooders PHP code.  This runs wget to fetch things
# asynchronously by processing the data form one while it sets off the
# next one (but there are still 3 second gaps between fetch starts).
# This doesn't handle favourites at all, so has a simplified config file
#
# Date      Programmer  Description
# 16 Dec 11 birdman     Film genre now set from flag.
# 29 Jun 11 birdman     Added bytes_to_text().
# 26 Jun 11 birdman     Added \xc2\x85 => "\x8a\x8a".  Might work...
# 25 Jun 11 birdman     Removed \xc2\(\x80-\x9f]) => $1 - \x40
# 07 Aug 10 birdman     map \xc2\x91 to quote.
#                       report per-channel data recoveries.
# 25 Oct 09 birdman     Put (Rpt) at start of description rather than in
#                       title as that:
#                        a) Aids searching on title
#                        b) Show up in MS6
# 15 Oct 09 birdman     Add 1d + 1h +1m on day wrap, to allow for 25 hour
#                        days at DST change.
# 12 Sep 09 birdman     Change logic on ignoring "old" data to use end time
#                       and handle cross-day date/timing.
# 25 Feb 09 birdman     Report total non-utf8/unknown/weird counts.
# 07 Nov 08 birdman     Map non-break space to space, as toppy seems
#                       to have no nbsp.
# 27 Oct 08 birdman     Fix day-wrap issues on midnights after DST changes.
# 25 Sep 08 birdman     Change weird_e_acute (and its short form) to
#                       long/short_repl_char and return "not sign".
# 09 Aug 08 birdman     Added 4-hr-max loop, and perltidy'ed
# 29 Jun 08 birdman     weird_unknown changed to weird_e_acute.
# 22 Jun 08 birdman     \xc2\(\x80-\x9f]) mapping added.
# 27 May 08 birdman     Added weird_unknown - map to \xa5.
# 14 May 08 birdman     Single wrx regex for weird_map and report
#                       any non-matched 6-char long chars from it.
# 16 Apr 08 birdman     Added "long" euro mapping to weird_map
# 01 Apr 08 birdman     Made some sense of the weird_map entries
# 14 Feb 08 birdman     Distinguish weird from truely unknown
# 31 Jan 08 birdman     More info on char mappings done.
# 10 Jan 08 birdman     Now using (simpler) utf-8 codes, mostly.
# 04 Jan 08 birdman     Fix $patterns so that # in regex isn't taken as
#                       a comment and tidy up report (multiply unknowns
#                       now reported before single printing of entry).
# 26 Oct 07 birdman     Report on empty channels
# 26 Sep 07 birdman     Fixed the code for limited channels that run
#                       over midnight
# ?? Apr 07 birdman     Guessing the date, but original perl version
#
##################################################################