Bug#605467: xmltv-util: tv_grab_fi fails to get listings

Edward J. Shornock Tue, 30 Nov 2010 04:57:24 -0800

Package: xmltv-util
Version: 0.5.58-1
Severity: normal
Tags: upstream patch


Due to a recent change in telkku.com, listings for Finland cannot be
fetched. This was fixed upstream in CVS revision 1.57.

Attached is a patch against 0.5.58.


-- System Information:
Debian Release: squeeze/sid
  APT prefers testing
  APT policy: (990, 'testing'), (500, 'unstable'), (1, 'experimental')
Architecture: amd64 (x86_64)

Kernel: Linux 2.6.36.1+bfs+aufs2.1 (SMP w/2 CPU cores; PREEMPT)
Locale: LANG=fi_FI.UTF-8, LC_CTYPE=fi_FI.UTF-8 (charmap=UTF-8)
Shell: /bin/sh linked to /bin/dash

Versions of packages xmltv-util depends on:
ii  libarchive-zip-perl      1.30-3          Perl module for manipulation of ZI
ii  libdate-manip-perl       6.11-1          module for manipulating dates
ii  libdatetime-format-strpt 1.5000-1        Perl module to parse and format st
ii  libhtml-parser-perl      3.66-1          collection of modules that parse H
ii  libhtml-tableextract-per 2.10-3          module for extracting the content 
ii  libhtml-tree-perl        3.23-2          Perl module to represent and creat
ii  libhttp-cache-transparen 1.0-1           Perl module used to transparently 
ii  libio-stringy-perl       2.110-4         Perl modules for IO from scalars a
ii  libparse-recdescent-perl 1.965001+dfsg-1 Perl module to create and use recu
ii  libsoap-lite-perl        0.712-2         Perl implementation of a SOAP clie
ii  libterm-readkey-perl     2.30-4          A perl module for simple terminal 
ii  libtext-bidi-perl        0.03-5          Unicode bidi algorithm for Perl us
ii  libtext-iconv-perl       1.7-2           converts between character sets in
ii  libwww-mechanize-perl    1.64-1          module to automate interaction wit
ii  libwww-perl              5.836-1         Perl HTTP/WWW client/server librar
ii  libxml-dom-perl          1.44-1          Perl module for building DOM Level
ii  libxml-libxml-perl       1.70.ds-1       Perl interface to the libxml2 libr
ii  libxml-libxslt-perl      1.70-1          Perl module for using the GNOME li
ii  libxml-parser-perl       2.36-1.1+b1     Perl module for parsing XML files
ii  libxml-twig-perl         1:3.34-1        Perl module for processing huge XM
ii  libxml-writer-perl       0.612-1         Perl module for writing XML docume
ii  libxmltv-perl            0.5.58-1        Perl libraries related to the XMLT
ii  perl [libcompress-zlib-p 5.10.1-16       Larry Wall's Practical Extraction 
ii  perl-modules             5.10.1-16       Core Perl modules

Versions of packages xmltv-util recommends:
ii  liblingua-preferred-perl      0.2.4-3    Perl module which allows language 
ii  libterm-progressbar-perl      2.09-6     Perl module to print a progress ba
ii  libunicode-string-perl        2.09-3+b1  Perl modules for Unicode strings

Versions of packages xmltv-util suggests:
pn  liblinux-dvb-perl             <none>     (no description available)
ii  liblog-tracemessages-perl     1.4-2      Perl module to allow for trace mes
ii  libtext-kakasi-perl           2.04-1+b1  KAKASI interface for scripting lan
ii  xmltv-gui                     0.5.58-1   Graphical user interface related t

-- debconf-show failed

diff -Naur xmltv-0.5.58/grab/fi/tv_grab_fi xmltv-0.5.58.new//grab/fi/tv_grab_fi
--- xmltv-0.5.58/grab/fi/tv_grab_fi	2010-09-07 03:59:26.000000000 +0300
+++ xmltv-0.5.58.new//grab/fi/tv_grab_fi	2010-11-30 14:44:01.117767465 +0200
@@ -27,7 +27,7 @@
 =head1 DESCRIPTION
 
 Output TV listings for several channels available in Finland.
-The data comes from www.telkku.com. The grabber relies on parsing HTML 
+The data comes from www.telkku.com. The grabber relies on parsing HTML
 so it might stop working at any time.
 
 First run B<tv_grab_fi --configure> to choose, which channels you want
@@ -86,9 +86,10 @@
 # initializations
 
 use strict;
-use XMLTV::Version '$Id: tv_grab_fi,v 1.56 2010/09/07 00:59:26 knowledgejunkie Exp $ ';
+use XMLTV::Version '$Id: tv_grab_fi,v 1.57 2010/11/27 21:18:47 va1210 Exp $ ';
 use XMLTV::Capabilities qw/baseline manualconfig cache/;
 use XMLTV::Description 'Finland';
+use Encode;
 use Getopt::Long;
 use Date::Manip;
 use HTML::Entities;
@@ -305,7 +306,7 @@
 # they could be separate stages.
 #
 my $bar = new XMLTV::ProgressBar( {
-   name => 'getting listings', 
+   name => 'getting listings',
    count => scalar @to_get,
  } ) if not $opt_quiet;
 foreach (@to_get) {
@@ -336,14 +337,23 @@
 
 my $warned_bad_chars;
 sub tidy( $ ) {
-    for (my $tmp = shift) {
-	tr/\t\205/ /d;
-	if (s/([^\012\015\040-\176\240-\377]+)//g) {
-	    warn "removing bad characters: '$1'"
-	      unless ($warned_bad_chars++ or $opt_quiet);
-	}
-	return $_;
-    }
+  my($string) = @_;
+  $string = decode_utf8($string);
+
+  # Make contents ISO-8859-1 compatible
+  # TAB                                  => space
+  # U+2013 (EN DASH)                     => -
+  # U+2019 (RIGHT SINGLE QUOTATION MARK) => '
+  # U+201D (RIGHT DOUBLE QUOTATION MARK) => "
+  $string =~ tr/\t\N{U+2013}\N{U+2019}\N{U+201D}/ \-\'\"/;
+
+  # Warn about incomplete replacement
+  if ($string =~ s/([^\N{U+0000}-\N{U+00FF}])//g) {
+    warn "removing bad character: 0x" . sprintf("%04x", ord($1))
+      unless ($warned_bad_chars++ or $opt_quiet);
+  }
+
+  return $string;
 }
 
 ####
@@ -359,7 +369,7 @@
 sub process_table {
     my ($date, $ch_xmltv_id, $ch_their_id) = @_;
     my $today = UnixDate($date, '%Y%m%d');
-    my $url = "$SITE/telkku?tila=knvt&kan=$ch_their_id&p=$today";
+    my $url = "$SITE/channel/list/$ch_their_id/$today";
     t "getting URL: $url";
     my $tree = get_nice_tree $url, \&tidy;
     local $SIG{__WARN__} = sub {
@@ -423,28 +433,28 @@
 	my ($stop_base, $stop_tz) = @{date_to_local($stop, $TZ)};
 	t 'converted back to Finnish: ' . d [ $stop_base, $stop_tz ];
 	$prog{stop}=UnixDate($stop_base, '%q') . " $stop_tz";
-    }	
+    }
 
     # Check for series.
     #
     # Check 1: episode name of series in title.
-    # If title contains a colon (:), check to see if the string on the 
-    # left-hand side of the colon has been defined as a series in the 
+    # If title contains a colon (:), check to see if the string on the
+    # left-hand side of the colon has been defined as a series in the
     # conf-file. If it has, assume that the string on the left-hand side
     # of the colon is the name of the series, and the string on the
-    # right-hand side is the name of the episode. For example, if the 
+    # right-hand side is the name of the episode. For example, if the
     # following line has been defined in the tv_grab_fi.conf-file:
-    # "series title Prisma", and the title of the program is 
-    # "Prisma: Totuus tappajadinosauruksista", then the script will assume 
-    # that the title of the program is actually "Prisma", and the episode 
+    # "series title Prisma", and the title of the program is
+    # "Prisma: Totuus tappajadinosauruksista", then the script will assume
+    # that the title of the program is actually "Prisma", and the episode
     # name/sub-title is "Totuus tappajadinosauruksista".
     if (($cur->{title} =~ m/([^:]+):\s*(.*)/) &&
 	(exists $title{$1})) {
 	my $new_title = $1;
 	my $episode = $2;
-	t "series $new_title, episode title $episode"; 
+	t "series $new_title, episode title $episode";
 	$prog{title}=[ [ $new_title, $LANG ] ];
-	$prog{'sub-title'} = [ [ $episode, $LANG ] ];	
+	$prog{'sub-title'} = [ [ $episode, $LANG ] ];
     }
     else {
 	$prog{title}=[ [ $cur->{title}, $LANG ] ];
@@ -459,14 +469,14 @@
     # and the description of the program is "Pingviinin paluu. Amerikkalainen
     # animaatiosarja. Outojen ryöstöjen sarja johdattaa Batmanin Pingviinin
     # jäljille.", then the script will assume that the episode name/sub-title
-    # is "Pingviinin paluu", and that the description is actually 
-    # "Amerikkalainen animaatiosarja. Outojen ryöstöjen sarja johdattaa 
+    # is "Pingviinin paluu", and that the description is actually
+    # "Amerikkalainen animaatiosarja. Outojen ryöstöjen sarja johdattaa
     # Batmanin Pingviinin jäljille."
     if ((defined $cur->{desc}) &&
 	(exists $description{$cur->{title}})   &&
 	($cur->{desc} =~ s/^\s*([^.]+)\.\s*//)) {
 	my $episode = $1;
-	t "series $cur->{title}, episode title $episode"; 
+	t "series $cur->{title}, episode title $episode";
 	$prog{'sub-title'} = [ [ $episode, $LANG ] ];
 
 	# Make sure the description is not left empty
@@ -491,30 +501,17 @@
 }
 
 #####
-# All program info is contained within a table cell with the
-# following properties:
-#
-# <td style="height: 500px; padding:5px 15px 5px 15px;" valign="top">
-#
-# For each program there is a entry that looks as follows:
-#
-# <b>hh:mm Program Name</b><br />Description<br /><br />
-# 
-# After processing the html with get_nice_tree, the space and slash in the 
-# <br /> tags sometimes fall off (with TreeBuilder v. 3.21 and below).
-# Also, all special characters (e.g. åäöÅÄÖ) are replaced with their 
-# corresponding html special characters. This means that the following 
-# program element
-#
-# <b>12.10 Tänään otsikoissa</b><br />Aamu-tv:n ajankohtaiset aiheet koosteena.<br /><br />
-# 
-# ends up looking like this (with TreeBuilder v. 3.21 and below):
+# All program info is contained within a unsorted list with class "programList"
 #
-# <b>12.10 T&auml;n&auml;&auml;n otsikoissa</b><br>Aamu-tv:n ajankohtaiset aiheet koosteena.<br><br>
-#
-# or with TreeBuilder v. 3.22 and above:
-#
-# <b>12.10 T&auml;n&auml;&auml;n otsikoissa</b><br />Aamu-tv:n ajankohtaiset aiheet koosteena.<br /><br />
+# </div>
+#  <ul class="programList">
+#   <li>
+#    <span class="programDate"><a href="http://www.telkku.com/program/show/2010112621451";>23:45&nbsp;Uutisikkuna</a></span><br />
+#    <span class="programDescription"></span>
+#   </li>
+#   ...
+#  </ul>
+#  <div ...
 #
 sub get_program_data {
     my $tree = shift;
@@ -522,17 +519,18 @@
     my @data;
     # Dump the html-tree to a string for matching
     my $html = $tree->as_HTML;
-    while ($html =~ m/popup\(this\.href\)">([0-9]{2})\.([0-9]{2}) (.+?)<\/a><br( \/)?>(.*?)<br( \/)?><br( \/)?>/g) {
-	# Use decode_entities() to convert html characters 
+    while ($html =~ m,<li><span class="programDate"><a href="$SITE/program/show/\d+">(\d{2}:\d{2})&nbsp;(.+?)</a></span><br(?: \/)?><span class="programDescription">(.*?)</span>,go) {
+	# Use decode_entities() to convert html characters
 	# to ascii (e.g &auml; to ä)
-	my %keys = (time => $1 . ':' . $2,
-		    title => decode_entities($3),
-		    desc => decode_entities($5),
-		    );
+	my %keys = (
+		    time  => $1,
+		    title => decode_entities($2),
+		    desc  => decode_entities($3),
+		   );
 	my %h;
 	foreach my $k (keys %keys) {
 	    my $v = $keys{$k};
-	    # Only record entry if it isn't empty (actually time 
+	    # Only record entry if it isn't empty (actually time
 	    # and title are required, but we don't check that.)
 	    if (length ($v) > 0) {
 		t "got a result from sub for $k: $v";
@@ -549,16 +547,13 @@
 # get channel listing
 sub get_channels {
     my $bar = new XMLTV::ProgressBar({
-       name => 'getting list of channels', 
+       name => 'getting list of channels',
        count => 1,
     } ) if not $opt_quiet;
-    my %channels;
-
-    # Channels are retrieved from the channel "Suosikkikanava" (149),
-    # to avoid listing it as a channel.
 
-    my $url="$SITE/telkku?tila=knvt&kan=149";
-    my $tree = get_nice_tree $url;
+    my $url="$SITE/channel";
+    t "getting URL: $url";
+    my $tree = get_nice_tree($url, \&decode_utf8);
 
     # FIXME commonize this
     local $SIG{__WARN__} = sub {
@@ -568,19 +563,36 @@
 	die "$url: $_[0]";
     };
 
-    # All channels are listed after the tag <h4>A-Z</h4> within <li>-tags
+    # All channels are listed in the left side bar within <li> tags
+    #
+    # <div id="channelContainer">
+    #  <div id="channelList">
+    #   <div id="channelListHeader">
+    #	 <div id="channelListHeaderRight"></div>
+    #    <div id="channelListHeaderLeft"></div>
+    #    Kaikki kanavat
+    #   </div>
+    #   <ul>
+    #    <li><a href="http://telkku.com/channel/list/1/20101127";>TV1</a></li>
+    #    ...
+    #   </ul>
+    #  </div>
+    #  <div id="channelContent">
 
     my $html = $tree->as_HTML;
-    $html =~ /<h3>A-Z<\/h3>\s*<ul>(.*)<\/ul>/;
-    my $trunc_html = $1;
-    while ($trunc_html =~ m/<li><a href=\"telkku\?tila=knvt&amp;kan=([0-9]+)\">(.+?)<\/a>/g) {
-	my $channel_id = $1;
-	my $channel_name = $2;
-	$channels{$channel_id} = $channel_name;
-	push @ch_all, { 'display-name' => [ [ $channel_name, $LANG ] ],
-			'id' => $channel_id };    
-    }
+    my($trunc_html) = $html =~ m,<div id="channelContainer">.*</div><ul>(.*)</ul></div><div id="channelContent">,;
+    die "can't find channel information" unless defined $trunc_html;
+
+    my %channels = ($trunc_html =~ m,<li><a href="$SITE/channel/list/(\d+)/\d+">(.+?)</a>,g);
     die "no channels could be found" if not keys %channels;
+
+    @ch_all = map { {
+		      'display-name' => [ [ $channels{$_}, $LANG ] ],
+		      id             => $_,
+		    } }
+              sort { $a <=> $b }
+	      keys %channels;
+
     update $bar if not $opt_quiet;
     $bar->finish() if not $opt_quiet;
     return %channels;

signature.asc
Description: Digital signature

Bug#605467: xmltv-util: tv_grab_fi fails to get listings

Reply via email to