#!/usr/bin/perl -w eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}' if 0; # not running under some shell =pod =head1 NAME tv_grab_es_digital - Grab TV listings for Spain (Digital+). =head1 SYNOPSIS tv_grab_es_digital --help tv_grab_es_digital [--config-file FILE] --configure tv_grab_es_digital [--config-file FILE] [--output FILE] [--days N] [--offset N] [--quiet] tv_grab_es_digital --list-channels =head1 DESCRIPTION Output TV listings for several channels available in Spain (Digital+). Currently those channels are available either at Hispasat and Astra on DVB format. The listings are currently coming directly from Digital+ (cplus.es) The grabber relies on parsing HTML so it might stop working at any time. First run B to choose, which channels you want to download. Then running B with no arguments will output listings in XML format to standard output. B<--configure> Prompt for which channels, and write the configuration file. B<--config-file FILE> Set the name of the configuration file, the default is B<~/.xmltv/tv_grab_es_digital.conf>. This is the file written by B<--configure> and read when grabbing. B<--output FILE> write to FILE rather than standard output. B<--days N> grab N days. The default is 3. B<--offset N> start N days in the future. The default is to start from today. B<--quiet> suppress the progress messages normally written to standard error. B<--help> print a help message and exit. =head1 SEE ALSO L. =head1 AUTHOR Ramon Roca, Ramon.Roca@XCombo.com, based on tv_grab_es =head1 BUGS =cut # Author's TODOs & thoughts # # get the icons of each grabbed channel from the website # # findout how to setup properly the language, (catalan, basque, galician, vo) # # get also the program descriptions. to make it faster, and even more complete # this can be accomplished by merging pdf files provided at the # site. That will be also smarter for the website since it will # avoid numerous gets and make run the grabber much faster. # # # get channel ids in RFC2838 format (I don't, actually the Id comes directly # web site, i don't know where to go for getting th id's for spanish # tv broadcasters. # # do the listings for cable digital TV. Good source for it looks to me # www.mediapark.es # # ###################################################################### # initializations use strict; use XMLTV::Version '$Id: tv_grab_es_digital,v 1.4 2004/05/09 17:49:12 epaepa Exp $ '; use Getopt::Long; use Date::Manip; use HTML::TreeBuilder; use HTML::Entities; # parse entities use IO::File; use XMLTV; use XMLTV::Memoize; use XMLTV::Ask; use XMLTV::Config_file; use XMLTV::DST; use XMLTV::Get_nice; use XMLTV::Mode; # Todo: perhaps we should internationalize messages and docs? use XMLTV::Usage < eval { require Term::ProgressBar; 1 }; # Attributes of the root element in output. my $HEAD = { 'source-info-url' => 'http://http://www.plus.es/codigo/television/rejilla_dia_dia.asp', 'source-data-url' => "http://http://www.plus.es/codigo/television/rejilla_dia_dia.asp", 'generator-info-name' => 'XMLTV', 'generator-info-url' => 'http://membled.com/work/apps/xmltv/', }; # Whether zero-length programmes should be included in the output. my $WRITE_ZERO_LENGTH = 0; # default language my $LANG="es"; # Global channel_data our @ch_all; ###################################################################### # get options # Get options, including undocumented --cache option. XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux'); my ($opt_days, $opt_offset, $opt_help, $opt_output, $opt_configure, $opt_config_file, $opt_quiet, $opt_list_channels); $opt_days = 3; # default $opt_offset = 0; # default $opt_quiet = 0; # default GetOptions('days=i' => \$opt_days, 'offset=i' => \$opt_offset, 'help' => \$opt_help, 'configure' => \$opt_configure, 'config-file=s' => \$opt_config_file, 'output=s' => \$opt_output, 'quiet' => \$opt_quiet, 'list-channels' => \$opt_list_channels ) or usage(0); die 'number of days must not be negative' if (defined $opt_days && $opt_days < 0); usage(1) if $opt_help; my $mode = XMLTV::Mode::mode('grab', # default $opt_configure => 'configure', $opt_list_channels => 'list-channels', ); # File that stores which channels to download. my $config_file = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_es_digital', $opt_quiet); my @config_lines; # used only in grab mode if ($mode eq 'configure') { XMLTV::Config_file::check_no_overwrite($config_file); } elsif ($mode eq 'grab') { @config_lines = XMLTV::Config_file::read_lines($config_file); } elsif ($mode eq 'list-channels') { # Config file not used. } else { die } # Whatever we are doing, we need the channels data. my %channels = get_channels(); # sets @ch_all my @channels; ###################################################################### # write configuration if ($mode eq 'configure') { open(CONF, ">$config_file") or die "cannot write to $config_file: $!"; # Ask about each channel. my @chs = sort keys %channels; my @names = map { $channels{$_} } @chs; my @qs = map { "add channel $_?" } @names; my @want = askManyBooleanQuestions(1, @qs); foreach (@chs) { my $w = shift @want; warn("cannot read input, stopping channel questions"), last if not defined $w; # No need to print to user - XMLTV::Ask is verbose enough. # Print a config line, but comment it out if channel not wanted. print CONF '#' if not $w; my $name = shift @names; print CONF "channel $_ $name\n"; # TODO don't store display-name in config file. } close CONF or warn "cannot close $config_file: $!"; say("Finished configuration."); exit(); } # Not configuration, we must be writing something, either full # listings or just channels. # die if $mode ne 'grab' and $mode ne 'list-channels'; # Options to be used for XMLTV::Writer. my %w_args; if (defined $opt_output) { my $fh = new IO::File(">$opt_output"); die "cannot write to $opt_output: $!" if not defined $fh; $w_args{OUTPUT} = $fh; } $w_args{encoding} = 'ISO-8859-1'; my $writer = new XMLTV::Writer(%w_args); $writer->start($HEAD); if ($mode eq 'list-channels') { $writer->write_channel($_) foreach @ch_all; $writer->end(); exit(); } ###################################################################### # We are producing full listings. die if $mode ne 'grab'; # Read configuration my $line_num = 1; foreach (@config_lines) { ++ $line_num; next if not defined; if (/^channel:?\s+(\S+)\s+([^\#]+)/) { my $ch_did = $1; my $ch_name = $2; $ch_name =~ s/\s*$//; push @channels, $ch_did; $channels{$ch_did} = $ch_name; } else { warn "$config_file:$line_num: bad line\n"; } } ###################################################################### # begin main program # Assume the listings source uses CET (see BUGS above). my $now = DateCalc(ParseDate('now'), "$opt_offset days"); die "No channels specified, run me with --configure\n" if not keys %channels; my @to_get; # the order in which we fetch the channels matters foreach my $ch_did (@channels) { my $ch_name=$channels{$ch_did}; my $ch_xid="$ch_did.cplus.es"; $writer->write_channel({ id => $ch_xid, 'display-name' => [ [ $ch_name ] ] }); my $day=UnixDate($now,'%Q'); for (my $i=0;$i<$opt_days;$i++) { push @to_get, [ $day, $ch_xid, $ch_did ]; #for each day $day=nextday($day); die if not defined $day; } } # This progress bar is for both downloading and parsing. Maybe # they could be separate. # my $bar = new Term::ProgressBar('getting listings', scalar @to_get) if Have_bar && not $opt_quiet; foreach (@to_get) { foreach (process_table($_->[0], $_->[1], $_->[2])) { $writer->write_programme($_); } update $bar if Have_bar && not $opt_quiet; } $writer->end(); ###################################################################### # subroutine definitions # Use Log::TraceMessages if installed. BEGIN { eval { require Log::TraceMessages }; if ($@) { *t = sub {}; *d = sub { '' }; } else { *t = \&Log::TraceMessages::t; *d = \&Log::TraceMessages::d; Log::TraceMessages::check_argv(); } } #### # process_table: fetch a URL and process it # # arguments: # Date::Manip object giving the day to grab # xmltv id of channel # cplus.es id of channel # # returns: list of the programme hashes to write # sub process_table { my ($date, $ch_xmltv_id, $ch_es_id) = @_; $ch_es_id =~ s/\+/\%2B/gi; my $today = UnixDate($date, '%d/%m/%Y'); my $url = "http://www.plus.es/codigo/television/rejilla_dia_dia.asp?Canales=$ch_es_id&Fecha=$today"; t $url; my $data=get_nice($url); if (not defined $data) { die "could not fetch $url, aborting\n"; } local $SIG{__WARN__} = sub { warn "$url: $_[0]"; }; # parse the page to a document object my $tree = HTML::TreeBuilder->new(); $tree->parse($data); my @program_data = get_program_data($tree); my $bump_start_day=0; my @r; while (@program_data) { my $cur = shift @program_data; my $next = shift @program_data; unshift @program_data,$next if $next; push @r, make_programme_hash($date, $ch_xmltv_id, $ch_es_id, $cur, $next); if (!$bump_start_day && bump_start_day($cur,$next)) { $bump_start_day=1; $date = UnixDate(DateCalc($date,"+ 1 day"),'%Q'); } } return @r; } sub make_programme_hash { my ($date, $ch_xmltv_id, $ch_es_id, $cur, $next) = @_; my %prog; $prog{channel}=$ch_xmltv_id; $prog{title}=[ [ $cur->{title}, $LANG ] ]; $prog{"sub-title"}=[ [ $cur->{subtitle}, $LANG ] ] if defined $cur->{subtitle}; $prog{category}=[ [ $cur->{category}, $LANG ] ]; $prog{start}=utc_offset("$date $cur->{time}", 'CET'); if (not defined $prog{start}) { warn "bad time string: $cur->{time}"; return undef; } # FIXME: parse description field further die if defined $cur->{desc} and $cur->{desc} !~ /\S/; $prog{desc}=[ [ $cur->{desc}, $LANG ] ] if defined $cur->{desc}; return \%prog; } sub bump_start_day { my ($cur,$next) = @_; if (!defined($next)) { return undef; } my $start = UnixDate($cur->{time},'%H:%M'); my $stop = UnixDate($next->{time},'%H:%M'); if (Date_Cmp($start,$stop)>0) { return 1; } else { return 0; } } # # program data is split as follows: # - program listings begin with a fixed header with "Hora", "Género" and # "Título". sub get_program_data { my ($tree) = @_; my @data; my @txt_elems = get_txt_elems($tree); # Actually time and title are required, but we don't check that. my $index = 0; while (defined $txt_elems[$index]) { if ( ($txt_elems[$index] eq "Hora") && ($txt_elems[$index + 1] =~ /nero/) && ($txt_elems[$index + 2] =~ /tulo/) ) { t "Program listing comes below"; $index = $index + 3; while ( $txt_elems[$index] =~ /^\d\d:\d\d/ ) { t "Program found: Hora: $txt_elems[$index] Programa: $txt_elems[$index+2]"; my $p_stime = $txt_elems[$index]; my $p_category = $txt_elems[$index + 1]; $p_category =~ s/\240//gi; if ($p_category !~ /\S/) { $p_category = "SIN CLASIFICAR"; } my $p_title = $txt_elems[$index + 2]; my $p_desc; # my $p_subtitle; if (not ( $txt_elems[$index + 3] =~ /^\d\d:\d\d/ ) ) { # Program has Description, or at least whitespace # where there should be a description. $p_desc = $txt_elems[$index + 3]; undef $p_desc if $p_desc !~ /\S/; $index = $index + 4; } else { # Program don't have Description $index = $index + 3; } # Check that a title is present, if not we can use # description instead. # if ($p_title !~ /\S/) { if (defined $p_desc) { $p_title = $p_desc; undef $p_desc; } else { warn "programme with no title at $p_stime, not writing"; next; } } my %h = ( time => $p_stime, category=> $p_category, title=> $p_title, # subtitle=> $p_subtitle, ); $h{desc} = $p_desc if defined $p_desc; push @data, \%h; # t "Next time?: $txt_elems[$index]"; } # end while prof the program } t $txt_elems[$index]; $index = $index + 1; } return @data; } sub get_txt_elems { my ($tree) = @_; my @txt_elem; my @txt_cont = $tree->look_down( sub { ($_[0]->descendants() eq 0 ) }, sub { defined($_[0]->attr ("_content") ) } ); foreach my $txt (@txt_cont) { my @children=$txt->content_list; for (my $tmp=$children[0]) { s/^\s+//;s/\s+$//; push @txt_elem, $_; } } return @txt_elem; } # get channel listing sub get_channels { my $bar = new Term::ProgressBar('getting list of channels', 1) if Have_bar && not $opt_quiet; my %channels; my $url="http://www.plus.es/codigo/television/plataformas/digitalplus/portada2.asp"; t $url; my $local_data=get_nice($url); die "could not get channel listing $url, aborting\n" if not defined $local_data; my $tree = HTML::TreeBuilder->new(); $tree->parse($local_data); my @menus = $tree->find_by_tag_name("_tag"=>"select"); foreach my $elem (@menus) { my $cname = $elem->attr('name'); if ($cname eq "Canales") { my @ocanals = $elem->find_by_tag_name("_tag"=>"option"); @ocanals = sort @ocanals; foreach my $opt (@ocanals) { t $opt->attr('value'); t $opt->attr('text'); if ((not $opt->attr('value') eq "") and (not $opt->attr('value') eq "OKPLUS")) { my $channel_id = $opt->attr('value'); my @children=$opt->content_list; my $channel_name=$children[0]; if (length $channel_id eq 1) { $channel_id = "0" . $channel_id } $channels{$channel_id}=$channel_name; push @ch_all, { 'display-name' => [ [ $channel_name, $LANG ] ], 'id'=> "$channel_id.cplus.es" }; } } } } die "no channels could be found" if not keys %channels; update $bar if Have_bar && not $opt_quiet; return %channels; } # Bump a YYYYMMDD date by one. sub nextday { my $d = shift; my $p = ParseDate($d); my $n = DateCalc($p, '+ 1 day'); return UnixDate($n, '%Q'); }