#!/usr/bin/perl -w eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}' if 0; # not running under some shell =pod =head1 NAME tv_grab_nl_wolf - Grab TV listings for Holland. =head1 SYNOPSIS tv_grab_nl_wolf --help tv_grab_nl_wolf [--output FILE] [--days N] [--offset N] [--quiet] =head1 DESCRIPTION Output TV listings for several channels available in the Netherlands. The data comes from Carlo de WolfE<39>s site exporting listings in XMLTV format. The default is to grab as many days as possible from the current day onwards. B<--output FILE> write to FILE rather than standard output. B<--days N> grab N days starting from today, rather than as many as possible. B<--offset N> start grabbing N days from today, rather than starting today. N may be negative. B<--quiet> suppress the progress messages normally written to standard error. B<--help> print a help message and exit. =head1 SEE ALSO L, L =head1 AUTHOR Ed Avis, ed@membled.com =head1 BUGS The grabber fetches data from the site with little processing. This means that if the upstream data is incorrect, the output of the grabber will be wrong. There are some fixes to correct errors such as stop < start, but others may creep in undetected. =cut use strict; use XMLTV::Version '$Id: tv_grab_nl_wolf,v 1.14 2004/07/24 10:20:49 axis3x3 Exp $ '; # We work by inheriting from XMLTV::Grab_XML and overriding certain # methods. # use XMLTV::Grab_XML; package Grab_XML_nl_wolf; use base 'XMLTV::Grab_XML'; # No need to be nice. $XMLTV::Get_nice::Delay = 0; use Date::Manip; use HTML::TreeBuilder; use XMLTV::TZ qw(tz_to_num); # Memoize one routine if possible. eval { require Memoize }; unless ($@) { for ('tz_to_num') { Memoize::memoize($_) or warn "cannot memoize $_"; } } # Todo: perhaps we should internationalize messages and docs? sub country( $ ) { my $pkg = shift; return 'the Netherlands'; } my $URL_HOST = 'http://wolf.xs4all.nl'; my $URL_DIR = '/xmltv/'; my $url_base = "$URL_HOST$URL_DIR"; # Returns a hash mapping YYYMMDD to URL. sub urls_by_date( $$$ ) { my $pkg = shift; my $index = $pkg->get($url_base); die "could not get index page $url_base, aborting\n" if not defined $index; # Use a horrible method to read the directory listing; we want the # filename and size. Any method we choose will break if the # format changes. # my @lines = split /\n/, $index; my @got; while (@lines >= 2) { if ($lines[0] =~ m!\1!) { my $filename = $1; if ($lines[1] =~ m!(\S+) kb!) { my $size_kb = $1; push @got, [ $filename, $size_kb ]; } } shift @lines; } die "saw no download links in $url_base\n" if not @got; my @urls; foreach (@got) { my ($filename, $size_kb) = @$_; if ($size_kb ne '0.0' and $size_kb ne '0') { push @urls, "/xmltv/$filename"; } else { warn "$filename has zero size on site, not downloading\n"; } } my %by_date; foreach (@urls) { s/^$URL_DIR//; if (/^tv-(\d{8})\./) { # We know that the same date can occur twice, we just take # the first occurrence. # $by_date{$1} = "$url_base/$_" unless defined $by_date{$1}; } } return %by_date; } # Fix up programmes which have stop time before their start time. We # assume the start time is correct. # my $warned_bad_stop_time = 0; sub xml_from_data( $$ ) { my $pkg = shift; my $xml = shift; return $pkg->remove_early_stop_times( $xml ); } Grab_XML_nl_wolf->go();