#!/usr/bin/perl -w -C eval 'exec /usr/bin/perl -w -C -S $0 ${1+"$@"}' if 0; # not running under some shell =pod =head1 NAME tv_grab_jp - Grab TV listings for Japan. =head1 SYNOPSIS tv_grab_jp --help tv_grab_jp [--config-file FILE] --configure tv_grab_jp [--config-file FILE] [--output FILE] [--days N] [--offset N] [--enable-readstr] [--no-category-code] [--quiet] tv_grab_jp --list-channels =head1 DESCRIPTION Output TV listings for several channels available in Japan. The grabber relies on parsing HTML so it might stop working at any time. First run B to choose, which channels you want to download. Then running B with no arguments will output listings in XML format to standard output. tv_grab_jp always grab 7 days of listings. B<--configure> Prompt for which channels, and write the configuration file. B<--config-file FILE> Set the name of the configuration file, the default is B<~/.xmltv/tv_grab_jp.conf>. This is the file written by B<--configure> and read when grabbing. B<--output FILE> write to FILE rather than standard output. B<--days N> grab N days. The default and the maximum is 7. B<--offset N> start N days in the future. The default is to start from today. B<--enable-readstr> add Hiragana(read strings) for program title. B<--no-category-code> suppress category code. English translated category names will be added to the listings without this option. B<--quiet> suppress the progress messages normally written to standard error. B<--list-channels> write output giving elements for every channel available, but no programmes. B<--help> print a help message and exit. =head1 SEE ALSO L. =head1 AUTHOR Takeru Komoriya Based on tv_grab_fi by Matti Airas and tv_grab_sn by Stefan G:orling. =head1 BUGS The data source may not suit recommended XMLTV DTD format. =cut ###################################################################### # initializations use strict; use XMLTV::Version '$Id: tv_grab_jp,v 1.4 2004/05/09 17:49:13 epaepa Exp $ '; use Getopt::Long; use Date::Manip; use HTML::TreeBuilder; use HTML::Entities; # parse entities use IO::File; use XMLTV; use XMLTV::Memoize; use XMLTV::Ask; use XMLTV::Config_file; use XMLTV::Mode; use XMLTV::Get_nice; use Text::Kakasi; use utf8; use Encode qw(from_to); use Encode::JP; # Todo: perhaps we should internationalize messages and docs? use XMLTV::Usage < eval { require Term::ProgressBar; 1 }; # Attributes of the root element in output. my $HEAD = { 'source-info-url' => 'http://www.ontvjapan.com/', 'source-data-url' => "http://www.ontvjapan.com/program/", 'generator-info-name' => 'XMLTV', 'generator-info-url' => 'http://membled.com/work/apps/xmltv/', }; # The timezone in Japan. my $TZ="+0900"; # language for XML output my $XMLLANG = "ja_JP"; # message language my $lang = $ENV{'LANG'}; $lang = '' if not defined $lang; # base URL of source data my $urlbase = "http://www.ontvjapan.com"; # xmltv channel id extension my $channel_ext = ".ontvjapan.com"; # Make sure Encode supports the encodings we want. my @encs = Encode->encodings(); my $wanted = 'euc-jp'; if (not grep { $_ eq $wanted } @encs) { die "did not see $wanted in list of encodings supported by Encode: @encs\n"; } # Global channel data. our @ch_all; # category translation data # (movie,series,sports,tvshow) my %categories = ('ドラマ' => 'series', '映画' => 'movie', 'スポーツ' => 'sports', '演劇・伝統芸' => 'etc', '音楽' => 'tvshow', 'バラエティー' => 'tvshow', '教養' => 'tvshow', 'アニメ・キッズ' => 'series', '社会・報道' => 'tvshow', '趣味・暮らし' => 'tvshow', 'その他' => 'etc'); ###################################################################### # get options XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux'); my ($opt_days, $opt_offset, $opt_help, $opt_output, $opt_configure, $opt_config_file, $opt_quiet, $opt_readstr, $opt_no_cat_code, $opt_list_channels); $opt_days = 7; # default $opt_offset = 0; # default $opt_readstr = 0; # default $opt_no_cat_code = 0; # default $opt_quiet = 0; # default GetOptions('help' => \$opt_help, 'configure' => \$opt_configure, 'config-file=s' => \$opt_config_file, 'output=s' => \$opt_output, 'quiet' => \$opt_quiet, 'enable-readstr' => \$opt_readstr, 'no-category-code' => \$opt_no_cat_code, 'list-channels' => \$opt_list_channels, 'days=i' => \$opt_days, 'offset=i' => \$opt_offset, ) or usage(0); die 'number of days must not be negative' if (defined $opt_days && $opt_days < 0); die 'number of offset must not be negative' if (defined $opt_offset && $opt_offset < 0); usage(1) if $opt_help; my $mode = XMLTV::Mode::mode('grab', # default $opt_configure => 'configure', $opt_list_channels => 'list-channels', ); # File that stores which channels to download. my $config_file; my @config_lines; # used only in grab mode if ($mode eq 'configure') { $config_file = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_jp', $opt_quiet); XMLTV::Config_file::check_no_overwrite($config_file); } elsif ($mode eq 'grab') { $config_file = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_jp', $opt_quiet); @config_lines = XMLTV::Config_file::read_lines($config_file); } ###################################################################### # write configuration if ($mode eq 'configure') { my $msg; open CONF, ">:utf8", "$config_file" or die "cannot write to $config_file: $!"; # Get region information my $barmsg = select_lang('地域情報を取得中', 'getting regions'); my $bar = new Term::ProgressBar($barmsg, 1) if Have_bar; my %regions = get_regions(); update $bar if Have_bar; # Ask region my %ask_regions; my @r; my $regionid = '0002'; my $req_region; Text::Kakasi::getopt_argv('kakasi', '-ieuc', '-Ja', '-Ha', '-Ka'); foreach (sort keys %regions) { my $region = $regions{$_}; if ($lang ne 'ja_JP.UTF-8') { utf8::encode($region); from_to($region, "utf8", "euc-jp"); # convert to EUC if ($lang ne 'ja_JP.eucJP') { $region=Text::Kakasi::do_kakasi($region); # convert to Romaji } } $ask_regions{$region} = $_; push @r, $region; if ($_ eq $regionid) { $req_region = $region; } } Text::Kakasi::close_kanwadict(); $msg = select_lang("\n地域を指定してください: ", "\nSelect your region: "); $req_region = askQuestion($msg, $req_region, @r); $regionid = $ask_regions{$req_region}; if ($regionid eq 'CATV') { $msg = select_lang("README.CATV.jaを参照して4桁のID番号を入力してください: ", "See README.CATV and input region ID: "); $regionid = ask($msg); if (length($regionid) != 4) { $msg = select_lang("IDが間違っています.\n", "Invalid region ID.\n"); die $msg; } } print CONF "region $regionid\n"; $barmsg = select_lang('チャンネル情報を取得中', 'getting channels'); $bar = new Term::ProgressBar($barmsg, 1) if Have_bar; my @channels = get_channels($regionid); update $bar if Have_bar; # Ask about each channel. Text::Kakasi::getopt_argv('kakasi', '-ieuc', '-Ja', '-Ha', '-Ka', '-Ea'); my @qs; foreach (@channels) { my $name = $_->{station}; my $callsign = $_->{callsign}; if (($lang ne 'ja_JP.UTF-8') and ($lang ne 'ja_JP.eucJP')) { utf8::encode($name); from_to($name, "utf8", "euc-jp"); # convert to EUC $name = Text::Kakasi::do_kakasi($name); # convert to Romaji } my $msg = select_lang("「$name($callsign)」を追加しますか? ", "add $name($callsign)? "); push @qs, $msg; } Text::Kakasi::close_kanwadict(); my @want = askManyBooleanQuestions(1, @qs); foreach (@channels) { my $w = shift @want; my $id = $_->{id}; my $name = $_->{station}; my $callsign = $_->{callsign}; warn("cannot read input, stopping channel questions"), last if not defined $w; # Print a config line, but comment it out if channel not wanted. print CONF '#' if not $w; print CONF "channel $id\n"; } close CONF or warn "cannot close $config_file: $!"; $msg = select_lang("設定完了.\n", "Finished\n"); print $msg; exit(); } # Not configuration, we must be writing something, either full # listings or just channels. # die if $mode ne 'grab' and $mode ne 'list-channels'; # Options to be used for XMLTV::Writer. my %w_args; if (defined $opt_output) { my $fh = new IO::File(">$opt_output"); die "cannot write to $opt_output: $!" if not defined $fh; $w_args{OUTPUT} = $fh; } $w_args{encoding} = 'UTF-8'; my $writer = new XMLTV::Writer(%w_args); $writer->start($HEAD); ###################################################################### # Channel listings if ($mode eq 'list-channels') { # Get region information my $barmsg = select_lang('地域情報を取得中', 'getting regions'); my $bar = new Term::ProgressBar($barmsg, 1) if Have_bar; my %regions = get_regions(); update $bar if Have_bar; # get channels for all region my @ch_regions = grep { $_ ne 'CATV' } sort keys %regions; $barmsg = select_lang('チャンネル情報を取得中', 'getting channels'); $bar = new Term::ProgressBar($barmsg, scalar keys %regions) if Have_bar; foreach (@ch_regions) { my $regionid = $_; my @channels = get_channels($regionid); foreach my $ch (@channels) { my $id = $ch->{'id'}; my $exist = 0; foreach (@ch_all) { $exist = 1 if ($id eq $_->{'id'}); } if (not $exist) { my $name = $ch->{'station'}; my $callsign = $ch->{'callsign'}; utf8::encode($name); push @ch_all, { 'id' => $id, 'station' => $name, 'callsign' => $callsign }; } } update $bar if Have_bar; } # Write channels mode. foreach (@ch_all) { write_channel($_); } $writer->end(); exit(); } # We are producing full listings. die if $mode ne 'grab'; ###################################################################### # begin main program # Read channels from configuration, push them to @ch_all my $regionid = ''; my @ch_conf; my $line_num = 1; foreach (@config_lines) { ++ $line_num; next if not defined; if (/^region:?\s+(\S+)/) { $regionid = $1; } elsif (/^channel:?\s+(\S+)/) { push @ch_conf, $1; } else { warn "$config_file:$line_num: bad line\n"; } } my $ch_number = @ch_conf; my $diemsg = select_lang("チャンネルが設定されていません.\n" . "まず --configure オプションをつけて設定をしてください.\n", "Channels are not configured. Run with --configure first.\n"); die $diemsg if ($ch_number == 0) or ($regionid eq ''); my $barmsg = select_lang('チャンネル情報を取得中', 'getting channels'); my $bar = new Term::ProgressBar($barmsg, 1) if Have_bar; my @channels = get_channels($regionid); update $bar if Have_bar; foreach (@channels) { my $id = $_->{id}; my $name = $_->{station}; utf8::encode($name); my $callsign = $_->{callsign}; foreach (@ch_conf) { if ($_ eq $id) { push @ch_all, { 'id' => $id, 'station' => $name, 'callsign' => $callsign }; } } } # the order in which we fetch the channels matters write_channel($_) foreach (@ch_all); # This progress bar is for both downloading and parsing. $barmsg = select_lang('番組表を取得中', 'getting program info'); $bar = new Term::ProgressBar($barmsg, scalar @ch_all) if Have_bar && not $opt_quiet; foreach (@ch_all) { process_table($_->{'id'}); update $bar if Have_bar && not $opt_quiet; } $writer->end(); ###################################################################### # subroutine definitions # Use Log::TraceMessages if installed. BEGIN { eval { require Log::TraceMessages }; if ($@) { *t = sub {}; *d = sub { '' }; } else { *t = \&Log::TraceMessages::t; *d = \&Log::TraceMessages::d; Log::TraceMessages::check_argv(); } } # write out channel data sub write_channel { my ($ch) = @_; $writer->startTag('channel', 'id' =>$ch->{'id'}); $writer->dataElement('display-name', $ch->{'station'}, 'lang'=>$XMLLANG); $writer->dataElement('display-name', $ch->{'callsign'}, lang => 'en'); $writer->endTag('channel'); } #### # process_table: fetch a URL and process it # # arguments: # id of channel # returns: list of program hashes to write # sub process_table { my ($channel_id) = @_; my $data = get_table($channel_id); # parse the page to a document object my $tree = HTML::TreeBuilder->new(); $tree->parse($data); parse_program_table($channel_id, $tree); } # parse program table sub parse_program_table { my ($channel, $tree) = @_; t "parse_program_table() ENTRY for tree: $tree"; # hash for checking program continuity of the day my %date_changed; my %prog_time; # elements with tag of which 'href' contains 'detail.php3' my @elems = $tree->look_down( '_tag','a'); # parse by elements and push program infomation to @data foreach my $elem (@elems) { my $href = $elem->attr('href'); next if not defined $href; next if not $href =~ m/detail.php3/; t 'doing elem: ' . d $elem; # get program infomation get_content($elem, $channel, \%date_changed, \%prog_time); } } sub get_content { my ($elem, $channel, $date_changed, $prog_time) = @_; my $p; # program title my $title = ''; foreach $p ($elem->content_list()) { if (ref $p) { # News/Weather icons my $src = $p->attr('src'); if ( $src =~ m/n.gif/ ) { $title .= "[N]"; } elsif ( $src =~ m/w.gif/ ) { $title .= "[天]"; } } else { # title from_to($p, "euc-jp", "utf8"); # convert to utf-8 utf8::decode($p); $title .= $p; } } $title =~ s/\s+/ /g; # convert Hankaku spaces to Zenkaku utf8::encode($title); return undef if $title eq ''; # contents my $desc = ''; foreach $p ($elem->right()) { if (not ref $p) { $p =~ s/^\s+//; last if ($p =~ m/^\d\d:\d\d/); # if time is inserted, then next program from_to($p, "euc-jp", "utf8"); # convert to utf-8 $desc .= $p; } } # air date my $hsid = $elem->attr('href'); $hsid =~ m/hsid=(\d{8})(\d{4})(\d{3})/; return undef if (not defined $1) or (not defined $3) ; my $date = $1; my $prog_id = $3; # incremental program counter of the day # air time and genre my $time_genre = $elem->attr('title'); $time_genre =~ m/^(\d\d:\d\d)-(\d\d:\d\d)\s*(\S*)/; my $starttime = $1; my $endtime = $2; my $genre = $3; from_to($genre, "euc-jp", "utf8"); # convert to UTF-8 $genre =~ s/\/.*$//; # delete after '/' return undef if (not defined $starttime) or (not defined $endtime); # start/end time $starttime =~ s/://; $endtime =~ s/://; my $startdate = $date; my $enddate = $date; # for the program over midnight if ($starttime > $endtime) { # real date of program end $enddate = UnixDate(DateCalc($date,"+ 1 day"), '%Q'); } # date transition check if (not defined $$date_changed{$date}) { if ((not defined $$prog_time{$date}) or ($$prog_time{$date} < $starttime)) { $$prog_time{$date} = $starttime; } else { # real date is changed here $$date_changed{$date} = $prog_id; } } # set real date if ((defined $$date_changed{$date}) and ($prog_id >= $$date_changed{$date})) { $startdate = UnixDate(DateCalc($date,"+ 1 day"), '%Q'); $enddate = UnixDate(DateCalc($date,"+ 1 day"), '%Q'); } # check the date for --days and --offset option my $today = ParseDate("today"); my $output_sdate = UnixDate(DateCalc($today, "+ $opt_offset days"), '%Q'); my $edays = $opt_offset + $opt_days - 1; my $output_edate = UnixDate(DateCalc($today, "+ $edays days"), '%Q'); if ($opt_offset > 0 || $opt_days < 7) { return undef if ($startdate < $output_sdate); return undef if ($startdate > $output_edate); } # read string (Romaji) of title my $t = $title; utf8::decode($t); $t =~ s/\[.+\]//g; # delete [NEWS] etc. $t =~ s/◇//g; # delete '◇' $t =~ s/[0-9]//g; # delete figures that represent time # We cannot use \d here, because it's locale-aware and # matches figures in wide charactors. Use [0-9] instead. utf8::encode($t); Text::Kakasi::getopt_argv('kakasi', '-ieuc', '-JH', '-KH', '-aE'); from_to($t, "utf8", "euc-jp"); # convert to EUC my $readstr = ''; if ($t ne '') { $readstr=Text::Kakasi::do_kakasi($t); from_to($readstr, "euc-jp", "utf8"); # convert to UTF-8 } Text::Kakasi::close_kanwadict(); t "TITLE:$title($readstr) $date $starttime-$endtime DESC:$desc GENRE:$genre\n"; # write out program my $start = $startdate . $starttime . "00 " . $TZ; my $stop = $enddate . $endtime . "00 " . $TZ; $writer->startTag('programme', 'start'=>$start, 'stop'=>$stop, 'channel'=>$channel); $writer->dataElement('title', $title, 'lang'=>$XMLLANG); $writer->dataElement('title', $readstr, 'lang'=>$XMLLANG . '@kana') if ($readstr ne '') and $opt_readstr; $desc =~ s/\s+$//; $writer->dataElement('desc', $desc, 'lang'=>$XMLLANG) if $desc ne ''; $writer->dataElement('category', $genre, 'lang'=>$XMLLANG) if $genre ne ''; if (($genre ne '') and (not $opt_no_cat_code)) { # Convert category code to English for MythTV's benefit. utf8::decode($genre); my $cat_code = $categories{$genre}; $writer->dataElement('category', $cat_code, lang => 'en') if (defined $cat_code and $cat_code ne ''); } $writer->endTag('programme'); } # get channel listing sub get_channels { my ($regionid) = @_; my @channels; my $local_data = get_channel_table($regionid); my $tree = HTML::TreeBuilder->new(); $tree->parse($local_data); # channels elements are specially formatted tags # with href="gridChannel.php..." my @ch_a_elems = $tree->look_down(_tag => 'a'); # Zenkaku -> Hankaku conversion Text::Kakasi::getopt_argv('kakasi', '-ieuc', '-Ea'); # get channels while (@ch_a_elems) { my $elem = shift @ch_a_elems; my $href = $elem->attr('href'); if ($href =~ m/^gridChannel.php\?tikicd=${regionid}&ch=(\d\d\d\d)/) { my $channel_id = $1; $channel_id .= $channel_ext; # this is xmltv channel id my $callsign = ($elem->content_list)[0]; $callsign = Text::Kakasi::do_kakasi($callsign); $callsign =~ s/[\(\)]//g; $elem = shift @ch_a_elems; my $channel = ($elem->content_list)[0]; # convert to UTF-8 from_to($channel, "euc-jp", "utf8"); from_to($callsign, "euc-jp", "utf8"); utf8::decode($channel); utf8::decode($callsign); # special fix for NHK-ETV call sign $callsign = 'ETV' if ($callsign eq 'NHK' and $channel eq 'NHK教育'); # special fix for Housou Daigaku $callsign = 'UAIR' if ($channel eq '放送大学'); push @channels, { 'id' => $channel_id, 'station' => $channel, 'callsign' => $callsign }; } } Text::Kakasi::close_kanwadict(); my $chnum = @channels; my $diemsg = select_lang("チャンネル情報の取得に失敗しました\n" . "ネットワークの接続と地域番号を確認してください", "Failed to get channel information.\n" . "Please check if region id and network connections are correct."); die $diemsg if $chnum == 0; return @channels; } # get regions sub get_regions { my %regions; my $local_data = get_area_list(); my $tree = HTML::TreeBuilder->new(); $tree->parse($local_data); # area elements are formatted tags # with href="javascript:sendUrl(0,'$id')" my @area_elems = $tree->look_down(_tag => 'a'); # get channels foreach (@area_elems) { my $href = $_->attr('href'); if ($href =~ m/^javascript:sendUrl.*'(.{3})'/) { my $region_id = '0' . $1; my $region_name = ($_->content_list)[0]; # convert to UTF-8 from_to($region_name, "euc-jp", "utf8"); utf8::decode($region_name); $regions{$region_id} = $region_name; } } my $diemsg = select_lang("地域情報の取得に失敗しました\n" . "ネットワークの接続を確認してください", "Failed to get region information.\n" . "Please check if network connections are correct."); die $diemsg if scalar keys %regions == 0; # for CATV station $regions{'CATV'} = 'CATV'; return %regions; } # get program table sub get_table { my ($channelid) = @_; my $ext = quotemeta $channel_ext; $channelid =~ s/$ext//; my $url = "$urlbase/program/gridChannel.php?ch=${channelid}&genre=all"; my $content = XMLTV::Get_nice::get_nice($url); return $content; } # get channel table sub get_channel_table { my ($regionid) = @_; # request channel data my $url = "$urlbase/program/gridChannel.php?tikicd=${regionid}"; my $content = XMLTV::Get_nice::get_nice($url); return $content; } # get area list sub get_area_list { # request area selection page my $url = "$urlbase/areachange/areaselect.php3?tv=1&force=1"; my $content = XMLTV::Get_nice::get_nice($url); return $content; } # Select proper language and encoding of output message sub select_lang { my ($ja, $eng) = @_; if ($lang eq 'ja_JP.UTF-8') { return $ja; } elsif ($lang eq 'ja_JP.eucJP') { utf8::encode($ja); from_to($ja, "utf8", "euc-jp"); # convert to EUC return $ja; } else { return $eng; } }