#!/usr/bin/perl # use strict; use warnings; use Time::Local qw( timelocal ); use POSIX qw( strftime ); my $DEBUG = 0; my $MIN_WGET_GAP = 3; # In seconds my $conf = "myrt.conf"; open CONF, $conf or die "open $conf: \n"; # Add YOUR mail address HERE - it won't run until you do! # my $user_agent = "my.name\@my.mail.domain auto-fetcher"; my $last_wget_start; my %lcn_to_name = (); sub open_rt { my ($lcn, $rtcn, $name) = @_; my $uri = "http://xmltv.radiotimes.com/xmltv/$rtcn.dat"; print STDERR "Getting $name($rtcn) for channel $lcn\n" if $DEBUG; my $rth; open $rth, "-|", "wget", "-T", "100", "--header=User-Agent: $user_agent", "-O-", "-q", $uri or do { my $o_err = $!; close $rth; print STDERR "wget failed: $o_err($!)\n"; exit 1; }; $last_wget_start = time(); return $rth; } # Global array and function for character mapping. # The xmltv data contains what looks a bit like the MHEG standard for # accented characters, but it's actually different. # (see: http://www.dtg.org.uk/reference/mheg_profile1_05.pdf sect 5.10) # This code is here to map the 2-byte sequences which xmltv sends into # the correct (or relevant) iso8859-9 character (which is what the # toppy's display uses). # (for listing see: http://www.langbox.com/codeset/iso8859-9.html) # my %BBC_CMap = ( '|' => "\xa6", # Split V-bar (| is our delimiter) # Jan 2008 - Seemed to switch to utf-8 encoding? # Does this negate the above ones? We'll see. # See end of script for old mappings. # Not much left here now... # ); # This function is called from the regex matching. # If $from is a string of whitespace we return a single space # If we have a mapping we return it. # Otherwise we return a currency character (circle with ticks at ne, se, # sw, nw) so that it will show up and set off an investigation to # determine the correct mapping. # our $this_line; # So we can show it on unknown mapping our @non_utf8_mapping; # With the switch to utf-8, RT was seen to send this for what looks # like left and right-(double)quote and apostrophe # # Also seen # "\xc3\xa2\xc2\x80\xc2\x94" # in the context it probably meant something like a long-hyphen # # All weird ones now grouped into: # code => [ text_descr, map_to ] # # Some sense was made of these entries by: # \xc3\xa2 => \xe2, followed by ignoring the \xc2 bytes # so "\xc3\xa2\xc2\x80\xc2\x93" => "\xe2\x80\x93" == en-dash # It turns out that this is double-enconding - to get what was # meant you need to double-decode utf8. # our %weird_map = ( "\xc3\xa2\xc2\x80\xc2\x93" => ['"long" en-dash', "-"], "\xc3\xa2\xc2\x80\xc2\x94" => ['"long" em-dash', "--"], "\xc3\xa2\xc2\x80\xc2\x98" => ['"long" left single-quote', "'"], "\xc3\xa2\xc2\x80\xc2\x99" => ['"long" right single-quote', "'"], "\xc3\xa2\xc2\x80\xc2\x9c" => ['"long" left double-quote', '"'], "\xc3\xa2\xc2\x80\xc2\x9d" => ['"long" right double-quote', '"'], "\xc3\xa2\xc2\x80\xc2\xa6" => ['"long" horizontal elipsis', '...'], "\xc3\xa2\xc2\x82\xc2\xac" => ['"long" euro', '(euro)'], ); our $wrx = qr/\xc3\xa2\xc2.\xc2./; # Generic 6-char regex for %weird_map # In the same vein, we have this. # "\xc3\xaf\xc2\xbf\xc2\xbd" => "\xef\xbf\xbd" == REPLACEMENT CHARACTER # (used to replace an incoming character whose value is unknown or # unrepresentable in Unicode) # In the specific context seen an é (e-acute) was expected, but it is # quite possible that the intention was an "unknown", so that is used. # I've now also seen "\xef\xbf\xbd" directly in the stream, and in # that case it meant "'" (and e-acute, and e-umlaut). # So map both to \xAC (not sign) and note... # our $long_repl_char = "\xc3\xaf\xc2\xbf\xc2\xbd"; our $short_repl_char = "\xef\xbf\xbd"; our $repl_count = 0; our $UNKNOWN = "\xa9"; our (@unknown, @weird); sub bytes_to_text { my $res; foreach my $ch (split("", $_[0])) { $res .= sprintf '\x%02x', ord($ch); } return $res; } sub do_map { my $from = shift; if ($from =~ /^\s/) { # Special case whitespace matcher return (" "); } if ($from eq $long_repl_char) { push @weird, "long_repl_char ($long_repl_char) seen!"; $repl_count++; return $UNKNOWN; } if ($from eq $short_repl_char) { push @weird, "short_repl_char ($short_repl_char) seen!"; $repl_count++; return $UNKNOWN; } # Do the generic utf-8 mapping. The rule here is: # # \xc2\(\x80-\x9f]) => $1 - \x40 (odd - but seen - now removed...) # \xc2([\xa0-\xbf]) => $1 # \xc3([\x80-\xbf]) => $1 + \x40 # # with these "failings" for iso8859-9 in place of 8859-1 # # \xc3\x90 (capital Eth) => \xc4\x9e (G with breve) # \xc3\x9d (Y with acute) => \xc4\xb0 (I with dot above) # \xc3\x9e (capital Thorn) => \xc5\x9e (S with cedilla) # \xc3\xb0 (small eth) => \xc7\x9f (g with breve) # \xc3\xbd (y with acute) => \xc4\xb1 (dotless i) # \xc3\xbe (small thorn) => \xc5\x9e (s with cedilla) # # They have no iso8859-9 equiv for the Toppy, so just note their presence. # # Not sure whether this is needed - uncomment if non-break space shows up # weirdly... # # if ($from eq "\xc2\xa0") { # Non-break space => space # return " "; # } if ($from =~ /^\xc2([\xa0-\xbf])$/) { return $1; } # None of the following are really defined (they should also map to $1, # as above. We just need to note them and seee what was meant # if ($from =~ /^\xc2([\x80-\x9f])$/) { my $mapped; my ($desc, $tag); if (ord($1) == 0x91) { # Seen as "left-quote" $mapped = "'"; $desc = 'odd'; $tag = "(?)"; } elsif (ord($1) == 0x85) { # This "next Line" # http://forum.toppy.org.uk/forum/viewtopic.php?p=118761#118761 # \x8a is half-font-height $mapped = "\x8a\x8a"; $desc = 'odd'; $tag = "(?)"; } else { $mapped = chr(ord($1)); $desc = 'weird'; $tag = ""; } my $ft = bytes_to_text($from); push @weird, "$desc mid \\xc2 ($from ($ft) => >>$mapped<<$tag) seen!"; return $mapped; } if ($from =~ /^$wrx$/) { if (exists $weird_map{$from}) { my $ft = bytes_to_text($from); push @weird, "$weird_map{$from}[0] ($from ($ft) => $weird_map{$from}[1]) seen!"; return $weird_map{$from}[1]; } (my $in_hex = $from) =~ s/(.)/sprintf("\\x%02x", unpack("C", $1))/eg; push @unknown, "$from ($in_hex)"; return "\xa4"; } if ($from =~ /^\xc3([\x80-\xbf])$/) { my $tchar = $1; if ($tchar =~ /^\x90|\x9d|\x9e|\xb0|\xbd|\xbe$/) { (my $in_hex = $from) =~ s/(.)/sprintf("\\x%02x", unpack("C", $1))/eg; push @unknown, "Non iso8859-9 char seen ($in_hex)"; } return chr(ord($tchar) + 0x40); } if (exists $BBC_CMap{$from}) { # Use mapping if we have one my $new_ch = $BBC_CMap{$from}; my $rpt; if (substr($from, 0, 1) eq "&") { $rpt = "HTML entity for $new_ch"; } elsif (substr($from, 0, 1) lt "\x80") { $rpt = "simple replace with $new_ch"; } else { ($rpt = $from) =~ s/(.)/sprintf("\\x%02x", unpack("C", $1))/eg; $rpt .= " mapped to $new_ch"; } push @non_utf8_mapping, "$from ($rpt)"; return $BBC_CMap{$from}; } # Generic 3-digit entity (<=255) handled generically # if ($from =~ /^&#(\d{3});$/) { my $ent = $1 + 0; if ($ent <= 255) { # Only cater for single bytes my $map_ch = chr($ent); push @non_utf8_mapping, "$from (generic HTML entity => $map_ch)"; return $map_ch; } } # Hmmm - unknown. Report it and return currency (spaceman) # (my $in_hex = $from) =~ s/(.)/sprintf("\\x%02x", unpack("C", $1))/eg; push @unknown, "$from ($in_hex)"; return "\xa4"; } my @days_in = qw( 0 31 28 31 30 31 30 31 31 30 31 30 31 ); # 1-based my $all_earliest = my $earliest = 2**32 - 1; my $all_latest = my $latest = 0; # Edit some fields for odd characters. This is designed to trap things # we aren't handling (but wish to know about) as well as things we do know # how to handle. # my $patterns = qr/(\s{2,}| # multi-whitespace \|| # | (used as field separator) $long_repl_char| # Oddly unknown $short_repl_char| # Le oddly unknown $wrx| # All the weird ones [\xc2-\xc3].| # extended chars 1 (2-bytes) &\#\d+;| # HTML entities (not comment!) [\x80-\xff]) # Anything else with top-bit set /x; my $today = strftime("%Y%m%d", localtime()); my $tot_non_utf8_mapping = my $tot_unknown = my $tot_weird = 0; my $ignore_ending_before = $^T - (4*3_600); sub reformat { my ($lcn, $line, $ch_start, $ch_end) = @_; chomp $line; $line =~ s/\x00|\x0d//g; # Nuls have been seen :-(. Remove any too. # We trim fields by adding any whitespace into the split # my ($rt_title, $rt_sub_title, $rt_episode, $rt_year, $rt_director, $rt_cast, $rt_premiere, $rt_film, $rt_repeat, $rt_subtitles, $rt_widescreen, $rt_new_series, $rt_deaf_signed, $rt_black_and_white, $rt_star_rating, $rt_certificate, $rt_genre, $rt_desc, $rt_choice, $rt_date, $rt_start, $rt_stop, $rt_duration_mins ) = split(/\s*~\s*/, $line); # Data now starts with a note about personal-use only... return unless (defined $rt_date); my ($day, $month, $year) = split("/", $rt_date); # Ignore anything starting before today... # ...this stops us counting it towards valid days' data reporting. # Actually - that was a bad idea for when you run this just after # midnight - you miss the current programs # So ignore anything *ending* before ~4 hours before now. # Which we have to do several lines later # # return if ($year . $month . $day lt $today); my ($start_hr, $start_min) = split(":", $rt_start); my ($stop_hr, $stop_min) = split(":", $rt_stop); # Get the epoch time, so we can remember the earliest and latest and # hence get the time-span of data # Note that the times have no dates, so when we span over midnight # we'll get a stop time before the start time. So add 1 day. # my $ep_start = timelocal(0, $start_min, $start_hr, $day, $month - 1, $year); my $ep_stop = timelocal(0, $stop_min, $stop_hr, $day, $month - 1, $year); # If we just add another 86_400 here it fails around DST changes # Use the day/month/year fields, but the given stop hr+min. # Note that we add 1d + 1h + 1m, so that we *do* advance to the next day # even when it has 24 hours... # if ($ep_stop < $ep_start) { my @newday = localtime($ep_stop + 86_400 + 3_601); $ep_stop = timelocal(0, $stop_min, $stop_hr, @newday[3, 4, 5]); } # Only now can we decide when it really ends and choose to ignore # it as too old. # return if ($ep_stop <= $ignore_ending_before); $earliest = $ep_start if ($ep_start < $earliest); $latest = $ep_stop if ($ep_stop > $latest); # If a limited channel, work out whether this is truncated # This code assumes that no programme last for 24 hours or more, since # it ignores days. # my $clipped = ""; if (defined $ch_end) { my $skip = 0; if ($ch_start gt $ch_end) { # String compares work! # If stop time < start time then there are effectively 3 periods # (morning, day-gap, night) # Handle according to where the start time falls. # if ($rt_start lt $ch_end) { # Starts in morning service if ($rt_stop gt $ch_end) { $clipped = "(Programme end at $rt_stop will be missed) "; ($stop_hr, $stop_min) = split(":", $ch_end); } } elsif ($rt_start lt $ch_start) { # Starts in day-gap if ($rt_stop gt $ch_start) { $clipped = "(Programme start at $rt_start will be missed) "; ($start_hr, $start_min) = split(":", $ch_start); } else { $skip = 1; # Entirely in gap } } else { # Starts in night service - does it run over midnight if (($rt_stop lt $rt_start) and ($rt_stop gt $ch_end)) { $clipped = "(Programme end at $rt_stop will be missed) "; ($stop_hr, $stop_min) = split(":", $ch_start); } } } else { # Single day period if (($rt_start lt $ch_start) and ($rt_stop le $ch_start)) { $skip = 1; # Skip this program - all over before start } elsif (($rt_start ge $ch_end) and ($rt_stop gt $ch_end)) { $skip = 1; # Skip this program - all run after end } elsif (($rt_start ge $ch_end) and ($rt_stop le $ch_start)) { $skip = 1; # Skip this program - between today end # and tomorrow start } elsif ($rt_start lt $ch_start) { $clipped = "(Programme start at $rt_start will be missed) "; ($start_hr, $start_min) = split(":", $ch_start); } elsif ($rt_stop gt $ch_end) { $clipped = "(Programme end at $rt_stop will be missed) "; ($stop_hr, $stop_min) = split(":", $ch_end); } } return if ($skip); } # Around DLST changeover times they may include timezone at the start # of the title in (). # $rt_title =~ s/^\((GMT|UTC\+1|BST|UTC)\)\s*//; # Tweak episode/sub_title. # I've removed this code since the only time I've seen a match was for # a program about 9/11 (where it was obviously part of the actual title). # Just move it to the sub-title. # ### if ($rt_episode) { ### if ($rt_episode =~ m|^(\d+/\d+)\s*(.*)|) { ### $rt_episode = $1; ### $rt_sub_title = $2; ### } ### else { ### $rt_sub_title = $rt_episode; ### $rt_episode = ''; ### } ### } $rt_sub_title = $rt_episode; $rt_episode = ''; # From EMJB on Toppy Forum # GML - changed to precede description, so it will show under MS6 # # $rt_title .= ' (Rpt)' if ($rt_repeat eq 'true'); $rt_desc = '(Rpt) '.$rt_desc if ($rt_repeat eq 'true'); @non_utf8_mapping = @unknown = @weird = (); foreach my $tr_field ($rt_title, $rt_desc, $rt_sub_title) { next if ($tr_field eq ""); $tr_field =~ s/$patterns/do_map($1)/eg; } my $do_line = 0; if (@non_utf8_mapping) { print STDERR "Non-utf8 mapping done:"; print STDERR join("\n ", "", @non_utf8_mapping), "\n"; $do_line = 1; $tot_non_utf8_mapping += @non_utf8_mapping; } if (@weird) { printf STDERR "Weird mapping%s for:", (@weird > 1)? "s": ""; print STDERR join("\n ", "", @weird), "\n"; $do_line = 1; $tot_weird += @weird; } if (@unknown) { printf STDERR "Unknown mapping%s for:", (@unknown > 1)? "s": ""; print STDERR join("\n ", "", @unknown), "\n"; $do_line = 1; $tot_unknown += @unknown; } # Strip CRs off output - Debian mailx treats this as a non-text file # { (my $rpt_line = $this_line) =~ s/\r+$//; print STDERR " (in $lcn_to_name{$lcn} - channel $lcn):\n$rpt_line" if ($do_line); } # Iff the duration is > 240mins we need to split the programme, so that # MyStuff will still know about it after 4 hours # for (my $start = $ep_start; $start < $ep_stop; $start += 14_400) { my $mei_str = ""; { my ($t_min, $t_hr, $t_day, $t_mon, $t_yr); ($t_min, $t_hr, $t_day, $t_mon, $t_yr) = (localtime($start))[1, 2, 3, 4, 5]; # Construct the string for the line in the mei file $mei_str .= sprintf( "%4d%02d%02d%02d%02d", $t_yr + 1900, $t_mon + 1, $t_day, $t_hr, $t_min ); my $stop = $start + 14_400; $stop = $ep_stop if ($stop > $ep_stop); ($t_min, $t_hr, $t_day, $t_mon, $t_yr) = (localtime($stop))[1, 2, 3, 4, 5]; $mei_str .= sprintf( "|%4d%02d%02d%02d%02d", $t_yr + 1900, $t_mon + 1, $t_day, $t_hr, $t_min ); } $mei_str .= "|$lcn"; # LCN of channel $mei_str .= "|$rt_title"; # program title $mei_str .= "|$rt_sub_title"; # secondary title, e.g. episode title $mei_str .= "|".(($rt_repeat eq 'true')? 'TRUE': ''); # if this is a repeat $mei_str .= "|" ; # if this is first screening of a new show - not in RT data $mei_str .= "|$rt_year"; # production date $mei_str .= "|" .(($rt_premiere eq 'true')? $rt_genre.' Premiere': '') ; # if it is a premiere $mei_str .= "|"; # last chance $mei_str .= "|$clipped$rt_desc"; # program description # Outsourcing from BBC meant that Film was no longer a genre - but the flag # is set correctly. # $rt_genre = 'Film' if ($rt_film eq 'true'); $mei_str .= "|$rt_genre"; # program genre $mei_str .= "|$rt_duration_mins"; # program length $mei_str .= "|".(($rt_widescreen eq 'true')? '16:9': ''); # widescreen $mei_str .= "|" .(($rt_black_and_white eq 'true')? 'no': '') ; # whether it is in colour $mei_str .= "|" .(($rt_subtitles eq 'true')? 'teletext': '') ; # whether there are subtitles $mei_str .= "|$rt_episode"; # episode number $mei_str .= "|" .(($rt_episode ne '')? 'xmltv_ns': ''); # episodes start at 1 $mei_str .= "|".(($rt_certificate ne '')? 'BBFC': ''); # rating system $mei_str .= "|$rt_certificate"; # rating $mei_str .= "|" .(($rt_star_rating ne '')? $rt_star_rating.'/5': '') ; # star rating $mei_str = sprintf("%06d", length($mei_str))."|$mei_str\r\n"; print $mei_str; } } # @wget_list is a list of entries containing: # # $lcn, $rtcn, $name, open_rt($lcn, $rtcn, $name), $ch_start, $ch_end, # $expect_empty # # We push an entry onto it then shift the previous one off, so it should # only ever have at most 2 entries on it. # my @wget_list; my $current; while () { $_ =~ s/#.*//; $_ =~ s/^\s+//; $_ =~ s/\s+$//; my ($lcn, $rtcn, $name, $ch_start, $ch_end) = split(/\s*,\s*/, $_); next unless (defined $name); $lcn_to_name{$lcn} = $name; my $expect_empty = 0; if (substr($rtcn, -1, 1) eq '?') { $rtcn = substr($rtcn, 0, -1); $expect_empty = 1; } if ($rtcn < 0) { print STDERR "Skipping UNKNOWN $name($rtcn) for channel $lcn\n"; next; } unless (@wget_list) { # Get a current on first pass push @wget_list, [ $lcn, $rtcn, $name, open_rt($lcn, $rtcn, $name), $ch_start, $ch_end, $expect_empty ]; next; } # Start processing what we already have # my ($cur_lcn, $cur_rtcn, $cur_name, $cur_wget, $cur_st, $cur_end, $cur_ee ) = @{shift @wget_list}; $this_line = ''; $earliest = 2**32 - 1; $latest = 0; while (<$cur_wget>) { if (not @wget_list and (time() - $last_wget_start) >= $MIN_WGET_GAP) { push @wget_list, [ $lcn, $rtcn, $name, open_rt($lcn, $rtcn, $name), $ch_start, $ch_end, $expect_empty ]; } next unless (/\S/); next if (/^In accessing this XML feed/); $this_line = $_; # So we can show it on unknown mapping (v.u.) reformat($cur_lcn, $_, $cur_st, $cur_end); } close $cur_wget or die "wget failed: $!\n"; $all_earliest = $earliest if ($earliest < $all_earliest); $all_latest = $latest if ($latest > $all_latest); $earliest = 0 if ($latest == 0); # == Nothing found.... { my $days_data = ($latest - $earliest)/86_400; my $data_warn = ($days_data < 12)? "ONLY ": ""; printf STDERR "Found %s%0.1f days data for %s(%s) ch %d\n", $data_warn, $days_data, $cur_name, $cur_rtcn, $cur_lcn; } if (not $this_line) { if (not $cur_ee) { print STDERR "RT data $cur_rtcn for $cur_name(ch $cur_lcn) was EMPTY!\n"; } else { print STDERR "RT data $cur_rtcn for $cur_name(ch $cur_lcn) was empty (expected)\n"; } } if ($this_line and $cur_ee) { print STDERR "RT data $cur_rtcn for $cur_name(ch $cur_lcn) was NOT EMPTY!\n"; } if (not @wget_list) { my $wait = ($MIN_WGET_GAP - (time() - $last_wget_start)); sleep $wait if ($wait > 0); push @wget_list, [ $lcn, $rtcn, $name, open_rt($lcn, $rtcn, $name), $ch_start, $ch_end, $expect_empty ]; } } # Handle the final one # my ($cur_lcn, $cur_rtcn, $cur_name, $cur_wget, $cur_st, $cur_end, $cur_ee) = @{shift @wget_list}; $this_line = ''; while (<$cur_wget>) { $this_line = $_; # So we can show it on unknown mapping (v.u.) reformat($cur_lcn, $_, $cur_st, $cur_end); } if (not $this_line and not $cur_ee) { print STDERR "RT data $cur_rtcn for $cur_name($cur_lcn) was EMPTY!\n"; } if ($this_line and $cur_ee) { print STDERR "RT data $cur_rtcn for $cur_name(ch $cur_lcn) was NOT EMPTY!\n"; } # Report on range of data # $all_earliest = 0 if ($all_latest == 0); # == Nothing found.... my $days_data = ($all_latest - $all_earliest)/86_400; my $data_warn = ($days_data < 12)? "ONLY ": ""; printf STDERR "*** STATS ***\n"; printf STDERR "%s%0.1f days data gathered\n", $data_warn, $days_data; if ($tot_non_utf8_mapping) { print STDERR "$tot_non_utf8_mapping non_utf8_mapping items found\n"; } if ($tot_unknown) { print STDERR "$tot_unknown unknown items found\n"; } if ($tot_weird) { print STDERR "$tot_weird weird items found"; print STDERR " ($repl_count unrepresentable replacements)" if ($repl_count); print STDERR "\n"; } __END__ # Documentation # # Since these *may* now have all been replaced with "correct" utf-8 # mapppings I'll comment them out and wait for reports. # # "\xc3\x97" => "\xb0", # degree sign (see also \xc3\xa6) # "\xc3\xa1" => "\xad", # A (soft) hyphen (also see \xc3\xf9) # "\xc3\xa6" => "\xb0", # Another(?) degree sign # "\xc3\xbb" => "\xbd", # 1/2 (a half) # "\xc3\xc6" => "'", # A quote (Windows intelligent? r quote) # "\xc3\xe0" => "...", # ellipsis # "\xc3\xe6" => "'", # A quote (Windows intelligent? l quote) # "\xc3\xf4" => '"', # Left double quote (‟ 0xE2 0x80 0x9F) # "\xc3\xf6" => '"', # Right double quote (” 0xE2 0x80 0x9D) # "\xc3\xf9" => "\xad", # A (soft) hyphen # "\xc3\xfa" => "\xa3", # Pound # "\xc3\xfb" => "\xad", # dash? # "\xc8\x98" => "\xf1", # n-tilde # "\xc8\xa1" => "\xed", # i-acute # "\xc8\xa4" => "\xde", # capital Thorn, but that is only in # # iso8859-1. So possibly a mistake. # # A ' was meant in the context seen. # "\xc8\xa8" => "\xe9", # e-acute # "\xc8\xa9" => "\xfb", # u_circumflex # "\xc8\xab" => "\xee", # i-circumflex # "\xc8\xac" => "\xea", # e-circumflex # "\xc8\xad" => "\xfc", # u-umlaut # "\xc8\xaf" => "\xef", # i-umlaut # "\xc8\xb5" => "\xf6", # o-umlaut # "\xc8\xb9" => "\xf8", # o-slash # "\xc8\xba" => "\xe7", # c-cedilla # "\xc8\xbd" => "\xeb", # e-umlaut # "\xc8\xbf" => "\xe8", # e-grave # "\xc8\xd1" => "\xe5", # a-ring # "\xc8\xe1" => "\xe0", # a-grave # "\xc8\xe8" => "\xca", # E-circumflex (albeit wrongly for the example) # "\xc8\xeb" => "\xc9", # E-acute # "\xc8\xed" => "\xe1", # a-acute # "\xc8\xf1" => "\xe4", # a-umlaut # "\xc8\xf3" => "\xe2", # a-circumflex # "\xc8\xfe" => "\xf3", # o-acute # Not only do top-bit chars get through, but so do html-entities. # Single byte numeric ones are handled generically, but we handle # multi-byte ones here, as we need to know what to map them too. # Some XML-entity encodings mentioned by xmltv # "—" => "--", # Long dash # "…" => "...", # Elipsis # "fi" => "fi", # Ligature for fi # # Description # Edited from Wooders PHP code. This runs wget to fetch things # asynchronously by processing the data form one while it sets off the # next one (but there are still 3 second gaps between fetch starts). # This doesn't handle favourites at all, so has a simplified config file # # Date Programmer Description # 16 Dec 11 birdman Film genre now set from flag. # 29 Jun 11 birdman Added bytes_to_text(). # 26 Jun 11 birdman Added \xc2\x85 => "\x8a\x8a". Might work... # 25 Jun 11 birdman Removed \xc2\(\x80-\x9f]) => $1 - \x40 # 07 Aug 10 birdman map \xc2\x91 to quote. # report per-channel data recoveries. # 25 Oct 09 birdman Put (Rpt) at start of description rather than in # title as that: # a) Aids searching on title # b) Show up in MS6 # 15 Oct 09 birdman Add 1d + 1h +1m on day wrap, to allow for 25 hour # days at DST change. # 12 Sep 09 birdman Change logic on ignoring "old" data to use end time # and handle cross-day date/timing. # 25 Feb 09 birdman Report total non-utf8/unknown/weird counts. # 07 Nov 08 birdman Map non-break space to space, as toppy seems # to have no nbsp. # 27 Oct 08 birdman Fix day-wrap issues on midnights after DST changes. # 25 Sep 08 birdman Change weird_e_acute (and its short form) to # long/short_repl_char and return "not sign". # 09 Aug 08 birdman Added 4-hr-max loop, and perltidy'ed # 29 Jun 08 birdman weird_unknown changed to weird_e_acute. # 22 Jun 08 birdman \xc2\(\x80-\x9f]) mapping added. # 27 May 08 birdman Added weird_unknown - map to \xa5. # 14 May 08 birdman Single wrx regex for weird_map and report # any non-matched 6-char long chars from it. # 16 Apr 08 birdman Added "long" euro mapping to weird_map # 01 Apr 08 birdman Made some sense of the weird_map entries # 14 Feb 08 birdman Distinguish weird from truely unknown # 31 Jan 08 birdman More info on char mappings done. # 10 Jan 08 birdman Now using (simpler) utf-8 codes, mostly. # 04 Jan 08 birdman Fix $patterns so that # in regex isn't taken as # a comment and tidy up report (multiply unknowns # now reported before single printing of entry). # 26 Oct 07 birdman Report on empty channels # 26 Sep 07 birdman Fixed the code for limited channels that run # over midnight # ?? Apr 07 birdman Guessing the date, but original perl version # ##################################################################