#!/usr/bin/perl -w eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}' if 0; # not running under some shell =pod =head1 NAME tv_grab_uk_rt - Grab TV listings for the United Kingdom, from an alternative source. =head1 SYNOPSIS tv_grab_uk_rt --help tv_grab_uk_rt [--config-file FILE] --configure tv_grab_uk_rt [--config-file FILE] [--output FILE] [--quiet] [--days N] [--offset N] [--slow [--limit-details HH:MM-HH:MM] --get-categories] =head1 DESCRIPTION Output TV and radio listings in XMLTV format for many stations available in Britain. The data comes from the Radio Times website. =head1 USAGE First you must run B to choose which stations you want to receive. Then running B with no arguments will get about a fortnightE<39>s listings for the stations you chose. B<--configure> Prompt for which stations to download and write the configuration file. B<--config-file FILE> Set the name of the configuration file, the default is B<~/.xmltv/tv_grab_uk_rt.conf>. This is the file written by B<--configure> and read when grabbing. B<--output FILE> When grabbing, write output to FILE rather than standard output. B<--days N> When grabbing, grab N days rather than as many as possible. B<--offset N> Start grabbing at today + N. N may be negative. B<--slow> Fetch additional details for each programme. This requires one extra web page fetch per programme, so use with care. B<--limit-details HH:MM-HH:MM> (use with --slow) Limit the additional details fetched for programmes by checking that the start time falls within the specified time range, which must be specified in 24-hour clock, eg: 16:00-02:00. This can significantly reduce the number of web page fetches. B<--get-categories> (use with --slow) Attempt to find out the category each program is in. This requires another web page get per category per day, so it can really slow down the grab. B<--quiet> suppress the progress messages normally written to standard error. B<--help> print a help message and exit. =head1 SEE ALSO L, L, L =head1 AUTHOR Ed Avis, ed@membled.com =head1 BUGS The website parsing isnE<39>t perfect and there will often be warning messages about bits of HTML that arenE<39>t understood. Some of the details provided by the site have to be thrown away because they cannot be accommodated in the XMLTV format; again, warning messages are printed. Rather than the all-or-nothing --slow mode, it would be better to fetch the details only for those programmes that are interesting, in some kind of two-pass grabbing. There is code to find out the 'categories' given for each programme, but that involves even more page fetches so it is disabled at present. =cut use strict; use XMLTV::Version '$Id: tv_grab_uk_rt.in,v 1.65 2004/05/23 16:23:15 epaepa Exp $ '; use IO::Socket; use Date::Manip; use Getopt::Long; require HTML::Entities; use XMLTV; use XMLTV::Memoize; use XMLTV::Ask; use XMLTV::DST; use XMLTV::Config_file; use XMLTV::Get_nice; use XMLTV::Date qw(parse_date); use XMLTV::Usage < 1, refreshOpener => 1, refreshOpener2 => 1, ); # Check options. First do the undocumented --cache option, then the # normal ones. # my $using_cache = XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux'); my ($opt_days, $opt_help, $opt_output, $opt_share, $opt_configure, $opt_config_file, $opt_offset, $opt_quiet, $opt_slow, $opt_detailtimerange, $opt_detailstarttime, $opt_detailstoptime, $opt_get_categories, ); # No default for $opt_days, we determine it from the site. $opt_offset = 0; # default today $opt_quiet = 0; # default $opt_slow = 0; # default GetOptions('days=i' => \$opt_days, 'help' => \$opt_help, 'configure' => \$opt_configure, 'config-file=s' => \$opt_config_file, 'output=s' => \$opt_output, 'share=s' => \$opt_share, # also undocumented 'offset=i' => \$opt_offset, 'quiet' => \$opt_quiet, 'slow' => \$opt_slow, 'limit-details=s' => \$opt_detailtimerange, 'get-categories' => \$opt_get_categories, ) or usage(0); die 'number of days must not be negative' if (defined $opt_days && $opt_days < 0); if ($opt_help) { usage(1); } die "--limit-details makes no sense without --slow\n" if defined $opt_detailtimerange and not $opt_slow; # Date::Manip has a bug where 'now' will be wrong if you change the # timezone. It won't be correctly converted from the system timezone # to the new one. So we call parse_date('today midnight') _before_ # Date_Init(). # my $today = DateCalc(parse_date('today midnight'), "$opt_offset days"); my $now = parse_date('now'); Date_Init('TZ=+0000'); # share/ directory for storing channel mapping files. This next line # is altered by processing through tv_grab_uk_rt.PL. But we can use # the current directory instead of share/tv_grab_uk for development. # # The 'source' file tv_grab_uk_rt.in has $SHARE_DIR undef, which means # use the current directory. In any case the directory can be # overridden with the --share option (useful for testing). # my $SHARE_DIR='/home/ed/share/xmltv'; # by grab/uk_rt/tv_grab_uk_rt.PL $SHARE_DIR = $opt_share if defined $opt_share; my $OUR_SHARE_DIR = (defined $SHARE_DIR) ? "$SHARE_DIR/tv_grab_uk_rt" : '.'; (my $CHANNEL_NAMES_FILE = "$OUR_SHARE_DIR/channel_ids") =~ tr!/!/!s; # Tables to convert between Radio Times and XMLTV ids of channels. # The way to access these is through the routines rt_to_xmltv() and # xmltv_to_rt(), not directly. Those will deal sensibly with a new RT # channel that isn't mentioned in the file. # my (%rt_to_xmltv, %xmltv_to_rt, %extra_dn); my $line_num = 0; foreach (XMLTV::Config_file::read_lines($CHANNEL_NAMES_FILE, 1)) { ++ $line_num; next unless defined; my $where = "$CHANNEL_NAMES_FILE:$line_num"; my @fields = split /:/; die "$where: wrong number of fields" if @fields < 2 or @fields > 3; my ($xmltv_id, $rt_id, $extra_dn) = @fields; warn "$where: RT id $rt_id seen already\n" if defined $rt_to_xmltv{$rt_id}; $rt_to_xmltv{$rt_id} = $xmltv_id; warn "$where: XMLTV id $xmltv_id seen already\n" if defined $xmltv_to_rt{$xmltv_id}; $xmltv_to_rt{$xmltv_id} = $rt_id; $extra_dn{$xmltv_id} = $extra_dn if defined $extra_dn; } # File that stores which channels to download. my $config_file = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_uk_rt', $opt_quiet); if ($opt_configure) { configure(); } else { my %g_args = (); if (defined $opt_output) { my $fh = new IO::File ">$opt_output"; die "cannot write to $opt_output\n" if not $fh; %g_args = (OUTPUT => $fh); } grab(\%g_args, [ XMLTV::Config_file::read_lines($config_file) ]); } # Display stats printf (STDERR "Accessed %d web pages, downloaded %d Kb, duration %d secs\n",$numwebgets,$kbwebgets,time()-$starttime) unless $opt_quiet; exit(); # Grab listings and write them in XML. Parameters: # # ref to hash of arguments to be passed to XMLTV::Writer (but encoding # is always ISO-8859-1), # ref to list of lines from config file. # sub grab( $$ ) { my ($w_args, $config_lines) = @_; my $writer = new XMLTV::Writer(%$w_args, encoding => 'ISO-8859-1'); my %write_channels; # to be written as elements # FIXME turn into progress bar. print STDERR "finding channels:\t" unless $opt_quiet; my %channels = get_channels(); print STDERR "got ". (scalar keys %channels) . " done.\n" unless $opt_quiet; if ( $opt_slow ) { if ( $opt_detailtimerange ) { if ( $opt_detailtimerange =~ m/[012][0-9]:[0-5][0-9]-[012][0-9]:[0-5][0-9]/ ) { ( $opt_detailstarttime,$opt_detailstoptime) = split('-',$opt_detailtimerange); # It is allowed for stop < start, you can have a time # range spanning midnight. # } else { die "Invalid argument to --limit-details: $opt_detailtimerange (should be HH:MM-HH:MM)"; } t "time range = $opt_detailstarttime - $opt_detailstoptime"; } else { print ( STDERR < # ALL # my $line_num = 1; foreach (@$config_lines) { ++ $line_num; next if not defined; my $where = "$config_file:$line_num"; if ($_ eq 'ALL') { %write_channels = %channels; } elsif (/^channel\s+(.+)/) { my $xmltv_id = $1; if (not defined $channels{$xmltv_id}) { warn "$where: no channel with XMLTV id $xmltv_id, skipping\n"; next; } $write_channels{$xmltv_id} = $channels{$xmltv_id}; } else { die "$where: bad line\n" } } # FIXME turn this into progress bar. print STDERR "getting dates for which listings available:\t" unless $opt_quiet; my @available_dates = get_available_dates(); t 'available dates: ' . d \@available_dates; die 'apparently, there are no days of listings on the site' if not @available_dates; print STDERR "got " . scalar @available_dates . " done.\n" unless $opt_quiet; my $is_available = sub( $ ) { my $d = shift; foreach (@available_dates) { return 1 if not Date_Cmp($d, $_); } return 0; }; my @dates_to_get; for (my $d = $today; $is_available->($d); $d = DateCalc($d, '+ 1 day')) { push @dates_to_get, $d; } die "listings for today ($today) not available" if not @dates_to_get; my $first_day = $dates_to_get[0]; my $last_day = $dates_to_get[-1]; foreach (@available_dates) { if (Date_Cmp($last_day, $_) < 0) { warn "strangely, day $_ is available but there are gaps before it"; } } if (defined $opt_days) { if ($opt_days > @dates_to_get) { warn 'only ' . (scalar @dates_to_get) . ' days of consecutive listings available'; } else { @dates_to_get = @dates_to_get[0 .. $opt_days - 1]; } } my $days = @dates_to_get > 1 ? 'days' : 'day'; say('getting ' . (scalar @dates_to_get) . " $days of listings\n") unless $opt_quiet; t 'getting dates:' . d \@dates_to_get; $writer->start({ 'source-info-url' => "$BASE_URL/", 'source-info-name' => 'Radio Times', 'generator-info-name' => 'XMLTV', 'generator-info-url' => 'http://membled.com/work/apps/xmltv/', }); my %categories; if ($opt_get_categories) { # Find all the categories. FIXME turn into progress bar. print STDERR "getting category names:\t" unless $opt_quiet; %categories = get_categories(); print STDERR "got ". (scalar keys %categories) . " done.\n" unless $opt_quiet; } # Find all the programes on the channels my %prog_to_cat; my %seen_prog; # get the listings for each date my @programmes; my %latest; foreach my $date (@dates_to_get) { # Create the time string according to the TIME_INTERVAL my $mns = 0; my $base_day = UnixDate($date, '%Q'); t "comparing $base_day with " . UnixDate($now, '%Q'); if ($base_day eq UnixDate($now, '%Q')) { # Don't bother downloading programmes that have already # happened. This happens only for today ($now), we assume # offset is not negative. # t "bumping from $base_day enough minutes to almost reach $now"; for (;;) { my $new_mns = $mns + $TIME_INTERVAL; my $new_date = DateCalc($base_day, "+ $new_mns minutes"); if (Date_Cmp($new_date, $now) > 0) { # No, adding those extra minutes would take us # past now. # last; } $mns = $new_mns; } } while ($mns < 24 * 60) { my $time = DateCalc($date, "+ $mns minutes"); $mns += $TIME_INTERVAL; my $timeend = DateCalc($date, "+ $mns minutes"); # first find all the new programmes for this time slot my @new_programmes; CHAN: foreach my $chan (sort keys %write_channels) { # Assume the user's preferred language is the same as # the RT site... this language-selection charade is a # bit pointless I admit. # my $dn = $write_channels{$chan}->{'display-name'}; my $name = XMLTV::best_name([ $LANG ], $dn)->[0]; $name = $chan if not defined $name; # Skip channels which already downloaded this time slot if (not $latest{$chan}) { $latest{$chan} = 0 } if ($latest{$chan} >= UnixDate($timeend, '%q')) { #print STDERR 'time ', UnixDate($time, '%q'), ", channel $name:\tSkipping\n" # unless $opt_quiet; next CHAN; } # FIXME turn into progress bar. print STDERR 'time ', UnixDate($time, '%q'), ", channel $name:\t" unless $opt_quiet; my @new_channel_programmes = get_programmes($chan, $time, \%prog_to_cat, \%categories, \%channels); # if first timeslot if ( $mns eq $TIME_INTERVAL && $date eq $first_day) { # we need to take care of any programs that start # before midnight of the previous day. # # this occurs if program starts before midnight, # and flows into current timeslot. # # this will only be the case for the 'first' timeslot # as the program will then be 'seen' and never # re-got -- provided we updated the %latest hash # # we make the assumption that any prog starting # after midday is actually starting yesterday # # Normally we discard such a programme because we # would have grabbed it the previous day; however # if today is the first day then we keep it. # my $midday = DateCalc($date, "+ 12 hours"); foreach (@new_channel_programmes) { if (Date_Cmp($_->{start}, $midday) > 0 && Date_Cmp($_->{start}, $_->{stop}) > 0) { # Programme starts after midday; # and stops before it. since we # stop going through the list as soon as # we find the first programme starting in # the morning, we can assume this is a # leftover from the previous day. t "today is first day, keeping yesterday's programme " . d $_; $_->{start} = utc_offset(DateCalc($_->{start}, "- 24 hours") . " UTC", '+0000'); } else { t 'seen a programme starting this morning: ' . d $_; last; } } } # check for programs that end after midnight, # update date of those that traverse midnight # bounday and throw away the rest -- after all # they start 'tomorrow', not 'today' my $midnight = DateCalc($date, "+ 24 hours"); my $past_midnight =undef; foreach (@new_channel_programmes) { if (Date_Cmp($_->{start},$midnight) <0 && Date_Cmp($_->{start}, $_->{stop}) > 0 ) { # starts before midnight, but stop{stop} = utc_offset(DateCalc($_->{stop}, "+ 24 hours") . ' UTC', '+0000'); t "updated stop time to tomorrow " . d \$_; $past_midnight=1; } else { # not crossing midnight boundary -- # could be from today, or tomorrow... if ( $past_midnight ) { # must be tomorrow! $_->{stop} = utc_offset(DateCalc($_->{stop}, "+ 24 hours") . ' UTC', '+0000'); $_->{start} = utc_offset(DateCalc($_->{start}, "+ 24 hours") . ' UTC', '+0000'); } } } # now got a reasonable programme list, # check for already seen progID's # and get details if necessary. foreach (@new_channel_programmes) { # update 'latest' hash my $lasttime = UnixDate($_->{stop}, '%q'); if ($latest{$chan} < $lasttime) { $latest{$chan} = $lasttime; } die unless $_->{_progID}; # have we already handled this prog? if ( not $seen_prog{$chan}->{$_->{_progID}} ) { $seen_prog{$chan}->{$_->{_progID}}++; # see if we need to get details for this program if ( test_get_details($_) ) { # get details for this prog my $progs = get_programme_details( $chan, $_->{_progID}, \%prog_to_cat, \%categories, \%channels); #t "get_programme_details returned " . d $progs; if (not $progs) { warn "could not get details for programme $_->{_progID} on channel $chan\n"; # add summary only push @new_programmes, $_; } elsif (not @$progs) { warn "strange, $_->{_progID} on channel $chan seems to be empty"; # add summary only push @new_programmes, $_; } else { print STDERR "@" unless $opt_quiet; push @new_programmes, @$progs; } } else { # don't get details for this program push @new_programmes, $_; } } # if not already seen } # for each new programme print STDERR "\n" unless $opt_quiet; } # Next find all the programes in each category (if there are new progs # FIXME we dont need to check all channels only those with new progs # FIXME we also need to check if there are more than 100 results and # search over less channels # if (@new_programmes) { if ($opt_get_categories) { t "searching categories"; %prog_to_cat = get_progs_in_cat([ sort keys %write_channels ], $time, \%categories); # Set the category for each new programme foreach my $new_prog (@new_programmes) { # can only be sure of categories for listed programmes, # followons do not have a _progID, and may be a different # type to their parent. # so only get categs for non-clumped and 0'th clump if ( not defined $new_prog->{clumpidx} or $new_prog->{clumpidx} =~ m/0\/[0-9]/ ) { my $channelId = $new_prog->{_chanID}; my $programmeId = $new_prog->{_progID}; if (!defined($channelId) or !defined($programmeId)) { warn "Cannot add categories for prog: \"$new_prog->{title}[0][0]\" at \"$new_prog->{start}\""; warn " --> got undefined programmeId\n" if ( !defined($programmeId)); warn " --> got undefined channelId\n" if (!defined($channelId)); t d $new_prog; } elsif (exists($prog_to_cat{"$channelId$programmeId"})) { my $cat_ID = $prog_to_cat{"$channelId$programmeId"}; my $cat = $categories{$cat_ID}; if (defined $cat) { push @{$new_prog->{category}}, [ $cat ]; } else { warn "unknown category id $cat\n"; } } } } t "done adding categories to programmes"; } } # push the new channels into the completlist push (@programmes, @new_programmes); } } # write out the xml # write out the channels $writer->write_channels(\%write_channels); foreach (@programmes) { foreach my $k (keys %$_) { die "undef $_->{$k}" if not defined $_->{$k}; } $writer->write_programme($_); } $writer->end(); } # Function to get a url. This also seems like a sensible place to do # HTML-demoronizing. # my $warned_bad_chars; sub get_url( $ ) { my $url = shift; for (my $tmp = get_nice($url)) { die "cannot get $url" if not defined; $numwebgets++; #update stats $kbwebgets+= (length $_)/1024; tr/\222\222\226/''-/; tr/\010//d; tr/\t/ /; if (s/([^\012\015\040-\176\240-\377]+)//g) { warn "removing bad characters: '$1'" unless $warned_bad_chars++; } return $_; } } # Function to find all the programmes on a channel (at a given date + # time). # # Parameters: # XMLTV id of channel # Date::Manip object giving date and time # prog_to_cat hash (see elsewhere for details) # categories hash # channels hash # # Returns: list of programmes # # I think this relies on the page returning exactly $TIME_INTERVAL # worth of listings. # sub get_programmes( $$$$$ ) { my $channel_xid = shift; my $time = shift; my $prog_to_cat = shift; my $categories = shift; my $channels = shift; my $day = UnixDate($time, '%Q'); die if not $day; my $channelId = xmltv_to_rt($channel_xid); my @p; my $url = "$BASE_URL/ListingsServlet?event=4&"; $url .= 'jspGridLocation=%2Fjsp%2Ftv_listings_grid.jsp&'; $url .= 'jspListLocation=%2Fjsp%2Ftv_listings_single.jsp&'; $url .= 'jspError=%2Fjsp%2Ferror.jsp&'; $url .= 'searchDate=' . UnixDate($time, '%d/%m/%Y') . '&'; $url .= 'searchTime=' . UnixDate($time, '%R') . '&'; $url .= 'channels=' . $channelId; # FIXME commonize this local $SIG{__WARN__} = sub { warn "$url: $_[0]"; }; local $SIG{__DIE__} = sub { die "$url: $_[0]"; }; my $data; eval { $data = get_url($url); # This check is mostly for the benefit of those using --cache. die 'strange, get_url() not supposed to return undef' if not defined $data; }; if ($@) { warn "could not get $url\n"; my $from_time = UnixDate($time, '%q'); my $to_time = UnixDate(DateCalc($time, "+ $TIME_INTERVAL seconds"), '%q'); warn "not fetching any programmes for channel $channel_xid " . "between $from_time and $to_time\n"; return (); } $data =~ tr/\n//d; print STDERR '#' unless $opt_quiet; my @results = ($data =~ /.*?/ig); if (not @results) { if ($data =~ /There are no programmes available/) { # Assume that this is because nothing is showing on that # channel, not because the site is missing some data. # } else { warn "$url: no results found in HTML\n"; } return (); } foreach (@results) { m/programmeId=([0-9]+)/ or die "$url: cannot find programmeId= in $_"; my $programmeId = $1; # The title attribute can be malformed (" characters). / title="([^>]+)">/ or die "$url: cannot find title= in $_"; my $link_title = $1; my ($start, $stop); for ($link_title) { # example: # $link_title="Gazon Maudit (French Twist) (10:00pm-11:50pm)" s/\s+\(\s*([0-9:. apm]+)\s*-\s*([0-9:. apm]+)\s*\)\s*$//i or die "expected to see '(start-stop)' times in title: $_"; ($start, $stop) = ($1, $2); foreach ($start, $stop) { $_ = utc_offset("$day $_", '+0000'); } } $url = "$BASE_URL/ListingsServlet?event=10&"; $url .= "channelId=$channelId&"; $url .= "programmeId=$programmeId&"; $url .= 'jspLocation=/jsp/prog_details.jsp'; push @p, { channel => $channel_xid, start => $start, stop => $stop, title => [ [ $link_title, $LANG ] ], url => [ $url ], _progID => $programmeId, _chanID => $channelId, }; print STDERR '#' unless $opt_quiet; } return @p; } # Function to parse the HTML and get all the info we need # # Parameters: # XMLTV id of channel # RT id of programme # hash mapping 'channelidprogrammeid' to RT category id # hash of categories # hash of channels # # Returns a listref of programmes: normally with just one element, but # can be more when two programmes share a timeslot. (The clumpidxes # will be set.) # my %warned_ch_mismatch; # eliminate duplicate warnings my %warned_windowschars; my %warned_unicodechars; sub get_programme_details( $$$$$ ) { # local $Log::TraceMessages::On = 1; my $channel_xid = shift; my $channelId = xmltv_to_rt($channel_xid); my $programmeId = shift; our %prog_to_cat; local *prog_to_cat = shift; our %categories; local *categories = shift; our %channels; local *channels = shift; # %p is the main programme we will return. my %p; $p{channel} = $channel_xid; $p{_chanID} = $channelId; $p{_progID} = $programmeId; # @followons are small extra programmes sharing its slot. Things # like news bulletins which come in the middle of a film are also # counted as 'after' it, for simplicity. # my @followons; if ($opt_get_categories) { my $cat_ID = $prog_to_cat{"$channelId$programmeId"}; if ($cat_ID and exists($categories{$cat_ID})) { my $cat = $categories{$cat_ID}; push @{$p{category}}, [ $cat ]; } } my $detail_url = "$BASE_URL/ListingsServlet?event=10&"; $detail_url .= "channelId=$channelId&"; $detail_url .= "programmeId=$programmeId&"; $detail_url .= 'jspLocation=/jsp/prog_details.jsp'; my $prog_details_string; eval { $prog_details_string = get_url($detail_url); die 'strange, get_url() not supposed to return undef' if not defined $prog_details_string; }; if ($@) { warn "cannot get $detail_url\n"; return undef; } # FIXME commonize this local $SIG{__WARN__} = sub { warn "$detail_url: $_[0]"; }; local $SIG{__DIE__} = sub { die "$detail_url: $_[0]"; }; $prog_details_string =~ tr/\r//d; if (not $prog_details_string =~ m{\s*(}s) { warn "cannot find main table in content of $detail_url, skipping\n"; return undef; } my $prog_details = $1; for ($prog_details) { my $C = ''; # used to stop strings running together # Remove scripting. s{}{$C}g; # Turn hyperlinks with only 'alt' text into ones with content. s!]*alt="(.+?)"[^>]*>!$2!g; # Remove javascript around hyperlink urls s!]*>([^<]*)}{$C$1$C}g; # Replace elements with just the URL and link text. s{]*>([^<]*)}{$C$1$C$2$C}g; t 'after href munging: ' . d $_; # Look for the *** stars ratings s{]*>} {$C Star Rating:$C$2$C$1$C}g; # Now we're ready to strip all markup. We use pipe characters # as a delimiter between bits of text. So first, check there # aren't any in there already. # tr/|//d; # Replace comments and HTML tags with pipes. s//\|/g; s/<[ ]*[^0-9][^>]*?>/\|/g; # decode any HTML special chars (&  ) HTML::Entities::decode_entities($_); # note   -> \240 tr/\240/ /; # get rid of known Windows encoded characters # silly windows characters to simple quotes tr/\221\222\223\224\226\227/\'\'\"\"\-\-/; tr/\010//d; # replace invalid windows chars oe ligatures s/\234/oe/g; s/\214/OE/g; # replace windows' "..." character s/\205/.../g; # HTML::Entities::decode_entities does not handle numeric unicode # punctuation in in XHTML # ie ‖ -> unicode:2012 = "-" # see http://www.unicode.org/charts/PDF/U2000.pdf s/&\#82(09|1[0123])/-/g; # 8209-8213 = 2011->2015 { my @unicodechars = m/&\#[0-9]+;/g; if ( @unicodechars ) { foreach ( @unicodechars ) { warn "stripping unknown unicode character (" . $_ . ") from input" unless $warned_windowschars{$_}++; } s/&\#[0-9]+;/\?/g; } } { my @windowschars = m/[\200-\237]/g; if ( @windowschars ) { foreach ( m/[\200-\237]/g ) { warn "stripping invalid windows character (" . ord($_) . " - $_) from input: $_" unless $warned_windowschars{ord($_)}++; } s/[\200-\237]/\?/g; } } # Tidy up the pipes and whitespace. Hey, ASCII art! s/\s+/ /g; s/\s+\|/\|/g; s/\|\s+/\|/g; tr/|/|/s; s/^\|//; s/\|$//; # local $Log::TraceMessages::On = 1; t 'after barification: ' . d $_; } my @bits = split /\|/, $prog_details; if (not @bits) { warn 'no programme details found in HTML'; return undef; } my $title = shift @bits; if (@bits and $bits[0] eq 'Star Rating:') { shift @bits; $p{'star-rating'} = [ shift(@bits) . '/5' ] ; shift @bits; } my $sub_title; if (@bits and $bits[0] ne 'Channel:') { $sub_title = shift @bits; } # The title might give us a hint about the timezone, or we might # have to guess. # my $tz; if ($title =~ s/^\((UTC|GMT|BST|[+-]0000|[+]0100)\)\s*//) { $tz = $1; } $p{title} = [ [ $title, $LANG ] ]; my ($channel_name, $date, $times, $cert, $sub_title_1, $desc, $director, $filmed_in, $cast); # Map heading to [ where to put it, multiplicity ]. my %fields = (Channel => [ \$channel_name, '1' ], Date => [ \$date, '1' ], Time => [ \$times, '1' ], Certificate => [ \$cert, '?' ], Episode => [ \$sub_title_1, '?' ], Review => [ \$desc, '?' ], # hmm 'Directed by' => [ \$director, '?' ], 'Filmed in' => [ \$filmed_in, '?' ], ); FIELD: foreach my $f (sort keys %fields) { my ($var, $mult) = @{$fields{$f}}; for (my $i = 0; $i < @bits; $i++) { die if not defined $bits[$i]; if ($bits[$i] =~ /^$f:? *$/) { my $val = $bits[$i + 1]; if (not defined $val) { warn "found $f: but nothing after it"; return undef; } $$var = $val; splice @bits, $i, 2; next FIELD; } } if ($mult eq '1') { # Mandatory item, and we didn't find it. warn "could not find $f: in programme details"; return undef; } elsif ($mult eq '?') { # No worry. } else { die "bad multiplicity specifier $mult"; } } # Check the channel name found matches the channel we thought. my $ch = $channels{$channel_xid}; die "no channel data for $channel_xid" if not defined $ch; my $dn = $ch->{'display-name'}->[0]->[0]; die "no display name for $channel_xid" if not defined $dn; # Normalize a channel name. my $ncn = sub( $ ) { local $_ = shift; s/\bOne\b/1/g; return $_; }; if ($ncn->($dn) ne $ncn->($channel_name)) { warn "channel name '$channel_name' for programme doesn't match $dn" unless $warned_ch_mismatch{$channel_name}{$dn}++; } my ($start, $start_tz); my ($stop, $stop_tz); if ($times =~ /^(.*) to (.*)$/) { my $pair; t "start time $1, calling rt_date()"; if (not defined ($pair = rt_date($date, $1, $tz))) { warn "cannot parse date $date with start time $1"; return undef; } ($start, $start_tz) = @$pair; t "got date $start with tz $start_tz"; t "stop time $2, calling rt_date()"; if (not defined ($pair = rt_date($date, $2, $tz))) { warn "cannot parse date $date with start time $1"; return undef; } ($stop, $stop_tz) = @$pair; t "got date $stop with tz $stop_tz"; } else { warn "bad Time value $times"; return undef; } # Some programmes have thir stop time on the next day. (This test # may break when the timezones change.) # if (Date_Cmp($start, $stop) > 0) { $stop = utc_offset(DateCalc($stop, '+ 1 day') . ' UTC', '+0000'); die if not defined $stop; } $p{start} = UnixDate($start, "%q $start_tz"); $p{stop} = UnixDate($stop, "%q $stop_tz"); if (defined $cert) { warn "already seen certificate" if defined $p{rating}; for ($cert) { if (not s/^\[(.+)\]$/$1/) { warn "bad certificate text: $_"; } else { push @{$p{rating}}, [ $_, 'BBFC' ]; } } } if (not defined $sub_title and not defined $sub_title_1) { # No secondary title. } elsif (not defined $sub_title and defined $sub_title_1) { $p{'sub-title'} = [ [ $sub_title_1, $LANG ] ]; } elsif (defined $sub_title and not defined $sub_title_1) { $p{'sub-title'} = [ [ $sub_title, $LANG ] ]; } elsif (defined $sub_title and defined $sub_title_1) { if ($sub_title eq $sub_title_1) { $p{'sub-title'} = [ [ $sub_title, $LANG ] ]; } else { warn "two sub-titles: $sub_title, $sub_title_1"; $p{'sub-title'} = [ [ $sub_title, $LANG ], [ $sub_title_1, $LANG ] ]; } } else { die } if (defined $desc) { $p{'desc'} = [ [ $desc, $LANG ] ]; } if (defined $director) { push @{$p{credits}{director}}, $director; } if (defined $filmed_in) { warn "already seen filmed-in date" if defined $p{date}; if ($filmed_in !~ /^\d+$/) { warn "bad filmed-in value '$filmed_in'\n"; } else { $p{date} = $filmed_in; } } if (defined $cast) { if ($cast =~ /(?:\.){5}/) { # The style giving part.....actor. There used to be code # for this, but it seems the website has stopped producing # it. # warn "discarding cast $cast"; } else { $p{credits}->{actor} = [ split /,\s*/, $cast ]; } } my ($options,$subtitles,$widescreen,$repeat,$black_and_white, $episode,$review); BIT: while (@bits) { my $bit = shift @bits; if ($bit eq 'Cast List') { # Some of the following bits are a cast list. t 'calling do_cast()'; do_cast(\%p, \@bits); t 'after do_cast(), remaining bits: ' . d \@bits; } elsif ($bit =~ /^Related [wW]ebsites$/ or $bit =~ /^Related [fF]eatures$/) { t 'calling do_link()'; do_link(\%p, \@bits); t 'after do_link(), remaining bits: ' . d \@bits; } elsif ($bit =~ /^javascript:/) { t 'javascript: link with no preceding text, pushing back'; t 'calling do_link()'; unshift @bits, $bit; do_link(\%p, \@bits); t 'after do_link(), remaining bits: ' . d \@bits; } elsif ($bit eq 'Add to my diary') { my $url = shift @bits; if (not defined $url) { warn "strange, no URL in 'Add to my diary'"; } elsif ($url !~ /^javascript:/) { warn "strange, add to diary URL not javascript"; } } else { t "unknown bit $bit, try do_misc()"; push @followons, do_misc(\%p, $bit); } } foreach (keys %p) { die "undef $_" if not defined $p{$_}; } if (@followons) { my $num = 1 + @followons; my $i = 0; foreach (\%p, @followons) { $_->{clumpidx} = "$i/$num"; ++ $i; } } return [ \%p, @followons ]; } # Process a single bit, probably containing flags like 'Repeat'. # Warns about unknown stuff. # # Parameters: # programme (will be modified) # bit of text (will be modified) # # Also returns any follow-on programmes which are found. # my $warned_deaf_signed; my $warned_audio_described; my $warned_discarding_updated_listing; my $warned_extradesc; my $warned_see_also; sub do_misc( $$ ) { our %p; local *p = shift; local $_ = shift; my @r; while (length) { if (s/^Subtitled,?\s*//) { warn 'seen subtitling twice' if defined $p{subtitles}; $p{subtitles} = [ { type => 'teletext' } ]; } elsif (s/^Widescreen,?\s*//) { warn 'seen widescreen twice' if defined $p{_widescreen}; # FIXME I think this can be handled under