Package: xmltv-util Version: 0.5.58-1 Severity: normal Tags: upstream patch
Due to a recent change in telkku.com, listings for Finland cannot be fetched. This was fixed upstream in CVS revision 1.57. Attached is a patch against 0.5.58. -- System Information: Debian Release: squeeze/sid APT prefers testing APT policy: (990, 'testing'), (500, 'unstable'), (1, 'experimental') Architecture: amd64 (x86_64) Kernel: Linux 2.6.36.1+bfs+aufs2.1 (SMP w/2 CPU cores; PREEMPT) Locale: LANG=fi_FI.UTF-8, LC_CTYPE=fi_FI.UTF-8 (charmap=UTF-8) Shell: /bin/sh linked to /bin/dash Versions of packages xmltv-util depends on: ii libarchive-zip-perl 1.30-3 Perl module for manipulation of ZI ii libdate-manip-perl 6.11-1 module for manipulating dates ii libdatetime-format-strpt 1.5000-1 Perl module to parse and format st ii libhtml-parser-perl 3.66-1 collection of modules that parse H ii libhtml-tableextract-per 2.10-3 module for extracting the content ii libhtml-tree-perl 3.23-2 Perl module to represent and creat ii libhttp-cache-transparen 1.0-1 Perl module used to transparently ii libio-stringy-perl 2.110-4 Perl modules for IO from scalars a ii libparse-recdescent-perl 1.965001+dfsg-1 Perl module to create and use recu ii libsoap-lite-perl 0.712-2 Perl implementation of a SOAP clie ii libterm-readkey-perl 2.30-4 A perl module for simple terminal ii libtext-bidi-perl 0.03-5 Unicode bidi algorithm for Perl us ii libtext-iconv-perl 1.7-2 converts between character sets in ii libwww-mechanize-perl 1.64-1 module to automate interaction wit ii libwww-perl 5.836-1 Perl HTTP/WWW client/server librar ii libxml-dom-perl 1.44-1 Perl module for building DOM Level ii libxml-libxml-perl 1.70.ds-1 Perl interface to the libxml2 libr ii libxml-libxslt-perl 1.70-1 Perl module for using the GNOME li ii libxml-parser-perl 2.36-1.1+b1 Perl module for parsing XML files ii libxml-twig-perl 1:3.34-1 Perl module for processing huge XM ii libxml-writer-perl 0.612-1 Perl module for writing XML docume ii libxmltv-perl 0.5.58-1 Perl libraries related to the XMLT ii perl [libcompress-zlib-p 5.10.1-16 Larry Wall's Practical Extraction ii perl-modules 5.10.1-16 Core Perl modules Versions of packages xmltv-util recommends: ii liblingua-preferred-perl 0.2.4-3 Perl module which allows language ii libterm-progressbar-perl 2.09-6 Perl module to print a progress ba ii libunicode-string-perl 2.09-3+b1 Perl modules for Unicode strings Versions of packages xmltv-util suggests: pn liblinux-dvb-perl <none> (no description available) ii liblog-tracemessages-perl 1.4-2 Perl module to allow for trace mes ii libtext-kakasi-perl 2.04-1+b1 KAKASI interface for scripting lan ii xmltv-gui 0.5.58-1 Graphical user interface related t -- debconf-show failed
diff -Naur xmltv-0.5.58/grab/fi/tv_grab_fi xmltv-0.5.58.new//grab/fi/tv_grab_fi --- xmltv-0.5.58/grab/fi/tv_grab_fi 2010-09-07 03:59:26.000000000 +0300 +++ xmltv-0.5.58.new//grab/fi/tv_grab_fi 2010-11-30 14:44:01.117767465 +0200 @@ -27,7 +27,7 @@ =head1 DESCRIPTION Output TV listings for several channels available in Finland. -The data comes from www.telkku.com. The grabber relies on parsing HTML +The data comes from www.telkku.com. The grabber relies on parsing HTML so it might stop working at any time. First run B<tv_grab_fi --configure> to choose, which channels you want @@ -86,9 +86,10 @@ # initializations use strict; -use XMLTV::Version '$Id: tv_grab_fi,v 1.56 2010/09/07 00:59:26 knowledgejunkie Exp $ '; +use XMLTV::Version '$Id: tv_grab_fi,v 1.57 2010/11/27 21:18:47 va1210 Exp $ '; use XMLTV::Capabilities qw/baseline manualconfig cache/; use XMLTV::Description 'Finland'; +use Encode; use Getopt::Long; use Date::Manip; use HTML::Entities; @@ -305,7 +306,7 @@ # they could be separate stages. # my $bar = new XMLTV::ProgressBar( { - name => 'getting listings', + name => 'getting listings', count => scalar @to_get, } ) if not $opt_quiet; foreach (@to_get) { @@ -336,14 +337,23 @@ my $warned_bad_chars; sub tidy( $ ) { - for (my $tmp = shift) { - tr/\t\205/ /d; - if (s/([^\012\015\040-\176\240-\377]+)//g) { - warn "removing bad characters: '$1'" - unless ($warned_bad_chars++ or $opt_quiet); - } - return $_; - } + my($string) = @_; + $string = decode_utf8($string); + + # Make contents ISO-8859-1 compatible + # TAB => space + # U+2013 (EN DASH) => - + # U+2019 (RIGHT SINGLE QUOTATION MARK) => ' + # U+201D (RIGHT DOUBLE QUOTATION MARK) => " + $string =~ tr/\t\N{U+2013}\N{U+2019}\N{U+201D}/ \-\'\"/; + + # Warn about incomplete replacement + if ($string =~ s/([^\N{U+0000}-\N{U+00FF}])//g) { + warn "removing bad character: 0x" . sprintf("%04x", ord($1)) + unless ($warned_bad_chars++ or $opt_quiet); + } + + return $string; } #### @@ -359,7 +369,7 @@ sub process_table { my ($date, $ch_xmltv_id, $ch_their_id) = @_; my $today = UnixDate($date, '%Y%m%d'); - my $url = "$SITE/telkku?tila=knvt&kan=$ch_their_id&p=$today"; + my $url = "$SITE/channel/list/$ch_their_id/$today"; t "getting URL: $url"; my $tree = get_nice_tree $url, \&tidy; local $SIG{__WARN__} = sub { @@ -423,28 +433,28 @@ my ($stop_base, $stop_tz) = @{date_to_local($stop, $TZ)}; t 'converted back to Finnish: ' . d [ $stop_base, $stop_tz ]; $prog{stop}=UnixDate($stop_base, '%q') . " $stop_tz"; - } + } # Check for series. # # Check 1: episode name of series in title. - # If title contains a colon (:), check to see if the string on the - # left-hand side of the colon has been defined as a series in the + # If title contains a colon (:), check to see if the string on the + # left-hand side of the colon has been defined as a series in the # conf-file. If it has, assume that the string on the left-hand side # of the colon is the name of the series, and the string on the - # right-hand side is the name of the episode. For example, if the + # right-hand side is the name of the episode. For example, if the # following line has been defined in the tv_grab_fi.conf-file: - # "series title Prisma", and the title of the program is - # "Prisma: Totuus tappajadinosauruksista", then the script will assume - # that the title of the program is actually "Prisma", and the episode + # "series title Prisma", and the title of the program is + # "Prisma: Totuus tappajadinosauruksista", then the script will assume + # that the title of the program is actually "Prisma", and the episode # name/sub-title is "Totuus tappajadinosauruksista". if (($cur->{title} =~ m/([^:]+):\s*(.*)/) && (exists $title{$1})) { my $new_title = $1; my $episode = $2; - t "series $new_title, episode title $episode"; + t "series $new_title, episode title $episode"; $prog{title}=[ [ $new_title, $LANG ] ]; - $prog{'sub-title'} = [ [ $episode, $LANG ] ]; + $prog{'sub-title'} = [ [ $episode, $LANG ] ]; } else { $prog{title}=[ [ $cur->{title}, $LANG ] ]; @@ -459,14 +469,14 @@ # and the description of the program is "Pingviinin paluu. Amerikkalainen # animaatiosarja. Outojen ryöstöjen sarja johdattaa Batmanin Pingviinin # jäljille.", then the script will assume that the episode name/sub-title - # is "Pingviinin paluu", and that the description is actually - # "Amerikkalainen animaatiosarja. Outojen ryöstöjen sarja johdattaa + # is "Pingviinin paluu", and that the description is actually + # "Amerikkalainen animaatiosarja. Outojen ryöstöjen sarja johdattaa # Batmanin Pingviinin jäljille." if ((defined $cur->{desc}) && (exists $description{$cur->{title}}) && ($cur->{desc} =~ s/^\s*([^.]+)\.\s*//)) { my $episode = $1; - t "series $cur->{title}, episode title $episode"; + t "series $cur->{title}, episode title $episode"; $prog{'sub-title'} = [ [ $episode, $LANG ] ]; # Make sure the description is not left empty @@ -491,30 +501,17 @@ } ##### -# All program info is contained within a table cell with the -# following properties: -# -# <td style="height: 500px; padding:5px 15px 5px 15px;" valign="top"> -# -# For each program there is a entry that looks as follows: -# -# <b>hh:mm Program Name</b><br />Description<br /><br /> -# -# After processing the html with get_nice_tree, the space and slash in the -# <br /> tags sometimes fall off (with TreeBuilder v. 3.21 and below). -# Also, all special characters (e.g. åäöÅÄÖ) are replaced with their -# corresponding html special characters. This means that the following -# program element -# -# <b>12.10 Tänään otsikoissa</b><br />Aamu-tv:n ajankohtaiset aiheet koosteena.<br /><br /> -# -# ends up looking like this (with TreeBuilder v. 3.21 and below): +# All program info is contained within a unsorted list with class "programList" # -# <b>12.10 Tänään otsikoissa</b><br>Aamu-tv:n ajankohtaiset aiheet koosteena.<br><br> -# -# or with TreeBuilder v. 3.22 and above: -# -# <b>12.10 Tänään otsikoissa</b><br />Aamu-tv:n ajankohtaiset aiheet koosteena.<br /><br /> +# </div> +# <ul class="programList"> +# <li> +# <span class="programDate"><a href="http://www.telkku.com/program/show/2010112621451">23:45 Uutisikkuna</a></span><br /> +# <span class="programDescription"></span> +# </li> +# ... +# </ul> +# <div ... # sub get_program_data { my $tree = shift; @@ -522,17 +519,18 @@ my @data; # Dump the html-tree to a string for matching my $html = $tree->as_HTML; - while ($html =~ m/popup\(this\.href\)">([0-9]{2})\.([0-9]{2}) (.+?)<\/a><br( \/)?>(.*?)<br( \/)?><br( \/)?>/g) { - # Use decode_entities() to convert html characters + while ($html =~ m,<li><span class="programDate"><a href="$SITE/program/show/\d+">(\d{2}:\d{2}) (.+?)</a></span><br(?: \/)?><span class="programDescription">(.*?)</span>,go) { + # Use decode_entities() to convert html characters # to ascii (e.g ä to ä) - my %keys = (time => $1 . ':' . $2, - title => decode_entities($3), - desc => decode_entities($5), - ); + my %keys = ( + time => $1, + title => decode_entities($2), + desc => decode_entities($3), + ); my %h; foreach my $k (keys %keys) { my $v = $keys{$k}; - # Only record entry if it isn't empty (actually time + # Only record entry if it isn't empty (actually time # and title are required, but we don't check that.) if (length ($v) > 0) { t "got a result from sub for $k: $v"; @@ -549,16 +547,13 @@ # get channel listing sub get_channels { my $bar = new XMLTV::ProgressBar({ - name => 'getting list of channels', + name => 'getting list of channels', count => 1, } ) if not $opt_quiet; - my %channels; - - # Channels are retrieved from the channel "Suosikkikanava" (149), - # to avoid listing it as a channel. - my $url="$SITE/telkku?tila=knvt&kan=149"; - my $tree = get_nice_tree $url; + my $url="$SITE/channel"; + t "getting URL: $url"; + my $tree = get_nice_tree($url, \&decode_utf8); # FIXME commonize this local $SIG{__WARN__} = sub { @@ -568,19 +563,36 @@ die "$url: $_[0]"; }; - # All channels are listed after the tag <h4>A-Z</h4> within <li>-tags + # All channels are listed in the left side bar within <li> tags + # + # <div id="channelContainer"> + # <div id="channelList"> + # <div id="channelListHeader"> + # <div id="channelListHeaderRight"></div> + # <div id="channelListHeaderLeft"></div> + # Kaikki kanavat + # </div> + # <ul> + # <li><a href="http://telkku.com/channel/list/1/20101127">TV1</a></li> + # ... + # </ul> + # </div> + # <div id="channelContent"> my $html = $tree->as_HTML; - $html =~ /<h3>A-Z<\/h3>\s*<ul>(.*)<\/ul>/; - my $trunc_html = $1; - while ($trunc_html =~ m/<li><a href=\"telkku\?tila=knvt&kan=([0-9]+)\">(.+?)<\/a>/g) { - my $channel_id = $1; - my $channel_name = $2; - $channels{$channel_id} = $channel_name; - push @ch_all, { 'display-name' => [ [ $channel_name, $LANG ] ], - 'id' => $channel_id }; - } + my($trunc_html) = $html =~ m,<div id="channelContainer">.*</div><ul>(.*)</ul></div><div id="channelContent">,; + die "can't find channel information" unless defined $trunc_html; + + my %channels = ($trunc_html =~ m,<li><a href="$SITE/channel/list/(\d+)/\d+">(.+?)</a>,g); die "no channels could be found" if not keys %channels; + + @ch_all = map { { + 'display-name' => [ [ $channels{$_}, $LANG ] ], + id => $_, + } } + sort { $a <=> $b } + keys %channels; + update $bar if not $opt_quiet; $bar->finish() if not $opt_quiet; return %channels;
signature.asc
Description: Digital signature