Author: dmeyer
Date: Fri Mar 23 12:50:17 2007
New Revision: 2583

Modified:
   trunk/epg/AUTHORS
   trunk/epg/src/sources/xmltv.py

Log:
sax based xmltv parser by tanja

Modified: trunk/epg/AUTHORS
==============================================================================
--- trunk/epg/AUTHORS   (original)
+++ trunk/epg/AUTHORS   Fri Mar 23 12:50:17 2007
@@ -10,4 +10,4 @@
 Contributions:
 
 Tanja Kotthaus <[EMAIL PROTECTED]>
-epgdata.com parser
+epgdata.com parser, SAX based XMLTV parser

Modified: trunk/epg/src/sources/xmltv.py
==============================================================================
--- trunk/epg/src/sources/xmltv.py      (original)
+++ trunk/epg/src/sources/xmltv.py      Fri Mar 23 12:50:17 2007
@@ -29,17 +29,18 @@
 
 __all__ = [ 'config', 'update' ]
 
-# Python imports
-import time
+# python imports
 import os
+import time
 import calendar
-import shutil
 import logging
 
+import xml.sax
+import xml.sax.saxutils
+
 # kaa imports
 from kaa import TEMP
 import kaa.notifier
-import kaa.xml
 
 from config_xmltv import config
 
@@ -47,43 +48,151 @@
 log = logging.getLogger('xmltv')
 
 
-def timestr2secs_utc(timestr):
+class XmltvParser(object):
     """
-    Convert a timestring to UTC (=GMT) seconds.
-
-    The format is either one of these two:
-    '20020702100000 CDT'
-    '200209080000 +0100'
+    Parser class for xmltv files
     """
-    # This is either something like 'EDT', or '+1'
-    try:
-        tval, tz = timestr.split()
-    except ValueError:
-        tval = timestr
-        tz   = str(-time.timezone/3600)
-
-    if tz == 'CET':
-        tz='+1'
-
-    # Is it the '+1' format?
-    if tz[0] == '+' or tz[0] == '-':
-        tmTuple = ( int(tval[0:4]), int(tval[4:6]), int(tval[6:8]),
-                    int(tval[8:10]), int(tval[10:12]), 0, -1, -1, -1 )
-        secs = calendar.timegm( tmTuple )
 
-        adj_neg = int(tz) >= 0
+    mapping = {
+            'title':'title',
+            'sub-title':'subtitle',
+            'episode-num':'episode',
+            'category':'genre',
+            'desc':'desc',
+            'date':'date'
+            }
+
+    channels = {}
+
+
+    def parse(self, filename):
+        """
+        Create a sax parser and parse the file
+        """
+
+        # Create a parser
+        parser = xml.sax.make_parser()
+        # ignore external dtd file
+        parser.setFeature(xml.sax.handler.feature_external_ges, False)
+
+        # create a handler
+        dh = xml.sax.ContentHandler()
+        dh.startElement = self.startElement
+        dh.endElement = self.endElement
+        dh.characters = self.characters
+
+        # Tell the parser to use our handler
+        parser.setContentHandler(dh)
+
+        self._dict = None
+        self._current = None
+        self._characters = ''
+        # parse the input
+        parser.parse('file://' + filename)
+
+
+    def error(self, exception):
+        log.exception(exception)
+
+
+    def startElement(self, name, attrs):
+        """
+        startElement function for SAX.
+
+        This will be called whenever we enter an element during parsing.
+        Then the attributes will be extracted.
+        """
+        if name == 'channel':
+            # extract attribute "id"
+            self._dict = {}
+            self._dict['channel_id'] = attrs.get('id', None)
+            self._dict['display-name'] = []
+        elif name == 'display-name':
+            # unfortunately there might be more than one for each channel
+            self._dict[name].append(u'')
+            self._current = name
+        elif name == 'programme':
+            self._dict = {}
+            # extract "start", "stop" and "id" from attributes
+            start = attrs.get('start',None)
+            self._dict['start'] = start
+            stop = attrs.get('stop',None)
+            self._dict['stop'] = stop
+            self._dict['channel_id'] = attrs.get('channel',None)
+        elif name in self.mapping:
+            # translate element name using self.mapping
+            name = self.mapping[name]
+            # start an empty string for the content of this element
+            self._dict[name] = u''
+            # and store the name of the current element
+            self._current = name
+
+
+    def characters(self, ch):
+        """
+        characters function for SAX
+        """
+        if self._dict is not None and self._current:
+            if self._current == 'display-name':
+                # there might be more than one display-name
+                self._dict['display-name'][-1] +=ch
+            else:
+                self._dict[self._current] += ch
+
+
+    def endElement(self, name):
+        """
+        endElement function for SAX
+        """
+        if name == 'channel':
+            # fill channel info to database
+            self.handle_channel(self._dict)
+            self._dict = None
+        elif name == 'programme':
+            # fill programme info to database
+            self.handle_programme(self._dict)
+            self._dict = None
+        # in any case:
+        self._current = None
+
+
+    def timestr2secs_utc(self, timestr):
+        """
+        Convert a timestring to UTC (=GMT) seconds.
+
+        The format is either one of these two:
+        '20020702100000 CDT'
+        '200209080000 +0100'
+        """
+        # This is either something like 'EDT', or '+1'
         try:
-            min = int(tz[3:5])
+            tval, tz = timestr.split()
         except ValueError:
-            # sometimes the mins are missing :-(
-            min = 0
-        adj_secs = int(tz[1:3])*3600+ min*60
+            tval = timestr
+            tz   = str(-time.timezone/3600)
+
+        if tz == 'CET':
+            tz='+1'
 
-        if adj_neg:
-            secs -= adj_secs
+        # Is it the '+1' format?
+        if tz[0] == '+' or tz[0] == '-':
+            tmTuple = ( int(tval[0:4]), int(tval[4:6]), int(tval[6:8]),
+                        int(tval[8:10]), int(tval[10:12]), 0, -1, -1, -1 )
+            secs = calendar.timegm( tmTuple )
+
+            adj_neg = int(tz) >= 0
+            try:
+                min = int(tz[3:5])
+            except ValueError:
+                # sometimes the mins are missing :-(
+                min = 0
+            adj_secs = int(tz[1:3])*3600+ min*60
+
+            if adj_neg:
+                secs -= adj_secs
+            else:
+                secs += adj_secs
         else:
-            secs += adj_secs
-    else:
         # No, use the regular conversion
 
         ## WARNING! BUG HERE!
@@ -91,110 +200,101 @@
         # handle time zones. There is no obvious function that does. Therefore
         # this bug is left in for someone else to solve.
 
-        try:
-            secs = time.mktime(strptime.strptime(timestr, xmltv.date_format))
-        except ValueError:
-            timestr = timestr.replace('EST', '')
-            secs = time.mktime(strptime.strptime(timestr, xmltv.date_format))
-    return secs
-
-
-
-def parse_channel(info):
-    """
-    Parse channel information
-    """
-    channel_id = info.node.getattr('id')
-    channel = station = name = display = None
-
-    for child in info.node:
-        # This logic expects that the first display-name that appears
-        # after an all-numeric and an all-alpha display-name is going
-        # to be the descriptive station name.  XXX: check if this holds
-        # for all xmltv source.
-        if child.name == "display-name":
-            if not channel and child.content.isdigit():
-                channel = child.content
-            elif not station and child.content.isalpha():
-                station = child.content
+            try:
+                secs = time.mktime(time.strptime(timestr,'%Y-%m-%d %H:%M:%S'))
+            except ValueError:
+                timestr = timestr.replace('EST', '')
+                secs = time.mktime(time.strptime(timestr,'%Y-%m-%d %H:%M:%S'))
+        return float(secs)
+
+
+    def handle_channel(self, attr):
+        """
+        put the channel info to the database
+        """
+        channel = station = name = display = None
+        channel_id = attr['channel_id']
+
+        while len(attr['display-name'])>0:
+            # This logic expects that the first display-name that appears
+            # after an all-numeric and an all-alpha display-name is going
+            # to be the descriptive station name.  XXX: check if this holds
+            # for all xmltv source.
+            content = attr['display-name'].pop(0)
+            if not channel and content.isdigit():
+                channel = content
+            elif not station and content.isalpha():
+                station = content
             elif channel and station and not name:
-                name = child.content
+                name = content
             else:
-                # something else, just remeber it in case we
+                # something else, just remember it in case we
                 # don't have a name later
-                display = child.content
+                display = content
 
-    if not name:
-        # set name to something. XXX: this is needed for the german xmltv
-        # stuff, maybe others work different. Maybe check the <tv> tag
-        # for the used grabber somehow.
-        name = display or station
-
-    db_id = info.add_channel(tuner_id=channel, name=station, long_name=name)
-    info.channel_id_to_db_id[channel_id] = [db_id, None]
-
-
-# mapping for xmltv -> epgdb
-ATTR_MAPPING = {
-    'desc': 'desc',
-    'sub-title': 'subtitle',
-    'episode-num': 'episode',
-    'category': 'genre' }
+        if not name:
+            # set name to something. XXX: this is needed for the german xmltv
+            # stuff, maybe others work different. Maybe check the <tv> tag
+            # for the used grabber somehow.
+            name = display or station
+
+
+            db_id = self.add_channel(tuner_id=channel,
+                                     name=station,
+                                     long_name=name)
+            self.channels[attr['channel_id']] = [db_id, None]
+
+
+    def handle_programme(self, attr):
+        """
+        put the programme info to the database
+        """
+        # first check the channel_id
+        channel_id = attr.pop('channel_id')
+        if channel_id not in self.channels:
+            log.warning("Program exists for unknown channel '%s'" % channel_id)
+            return
 
-def parse_programme(info):
-    """
-    Parse a program node.
-    """
-    channel_id = info.node.getattr('channel')
-    if channel_id not in info.channel_id_to_db_id:
-        log.warning("Program exists for unknown channel '%s'" % channel_id)
-        return
-
-    title = None
-    attr = {}
-
-    for child in info.node.children:
-        if child.name == "title":
-            title = child.content
-        elif child.name == "date":
+        # then there should of course be a title
+        title = attr.pop('title')
+
+        # the date element should be an integer
+        try:
+            date = attr.pop('date')
             fmt = "%Y-%m-%d"
-            if len(child.content) == 4:
+            if len(date) == 4:
                 fmt = "%Y"
-            attr['date'] = int(time.mktime(time.strptime(child.content, fmt)))
-        elif child.name in ATTR_MAPPING.keys():
-            attr[ATTR_MAPPING[child.name]] = child.content
-
-    if not title:
-        return
-
-    start = timestr2secs_utc(info.node.getattr("start"))
-    db_id, last_prog = info.channel_id_to_db_id[channel_id]
-    if last_prog:
-        # There is a previous program for this channel with no stop time,
-        # so set last program stop time to this program start time.
-        # XXX This only works in sorted files. I guess it is ok to force the
-        # user to run tv_sort to fix this. And IIRC tv_sort also takes care of
-        # this problem.
-        last_start, last_title, last_attr = last_prog
-        info.add_program(db_id, last_start, start, last_title, **last_attr)
-    if not info.node.getattr("stop"):
-        info.channel_id_to_db_id[channel_id][1] = (start, title, attr)
-    else:
-        stop = timestr2secs_utc(info.node.getattr("stop"))
-        info.add_program(db_id, start, stop, title, **attr)
-
-
-class UpdateInfo:
-    """
-    Simple class holding information we need for update information.
-    """
-    pass
+            attr['date'] = int(time.mktime(time.strptime(date, fmt)))
+        except KeyError:
+            pass
+
+        # then the start time
+        start = self.timestr2secs_utc(attr.pop('start'))
+
+        # stop time is more complicated, as it is not always given
+        db_id, last_prog = self.channels[channel_id]
+        if last_prog:
+            # There is a previous program for this channel with no stop time,
+            # so set last program stop time to this program start time.
+            # XXX This only works in sorted files. I guess it is ok to force the
+            # user to run tv_sort to fix this. And IIRC tv_sort also takes care of
+            # this problem.
+            last_start, last_title, last_attr = last_prog
+            self.add_program(db_id, last_start, start, last_title, **last_attr)
+            self.channels[channel_id][1] = None
+        try:
+            stop = self.timestr2secs_utc(attr.pop('stop'))
+            # we have all info, let's fill it to the database
+            self.add_program(db_id, start, stop, title, **attr)
+        except:
+            # there is no stop time for this
+            self.channels[channel_id][1] = (start, title, attr)
 
 
 @kaa.notifier.execute_in_thread('epg')
-def _parse_xml():
+def update(epg):
     """
-    Thread to parse the xml file. It will also call the grabber if needed.
+    Interface to source_xmltv.
     """
     if config.grabber:
         log.info('grabbing listings using %s', config.grabber)
@@ -226,61 +326,11 @@
         xmltv_file = config.data_file
 
     # Now we have a xmltv file and need to parse it
-    log.info('parse xml file')
-    try:
-        doc = kaa.xml.Document(xmltv_file, 'tv')
-    except:
-        log.exception('error parsing xmltv file')
-        return
-
-    channel_id_to_db_id = {}
-    nprograms = 0
-
-    for child in doc:
-        if child.name == "programme":
-            nprograms += 1
-
-    info = UpdateInfo()
-    info.doc = doc
-    info.node = doc.first
-    info.channel_id_to_db_id = channel_id_to_db_id
-    info.total = nprograms
-    info.progress_step = info.total / 100
-
-    return info
-
-
-@kaa.notifier.yield_execution()
-def update(epg):
-    """
-    Interface to source_xmltv.
-    """
-    if not config.data_file and not config.grabber:
-        log.error('XMLTV gabber not configured.')
-        yield False
-    # _parse_xml is forced to be executed in a thread. This means that
-    # it always returns an InProgress object that needs to be yielded.
-    # When yield returns we need to call the InProgress object to get
-    # the result. If the result is None, the thread run into an error.
-    info = _parse_xml()
-    yield info
-    info = info()
-    if not info:
-        yield False
-
-    info.add_program = epg.add_program
-    info.add_channel = epg.add_channel
-    t0 = time.time()
-    while info.node:
-        if info.node.name == "channel":
-            parse_channel(info)
-        if info.node.name == "programme":
-            parse_programme(info)
-
-        info.node = info.node.get_next()
-        if time.time() - t0 > 0.1:
-            # time to return to the main loop
-            yield kaa.notifier.YieldContinue
-            t0 = time.time()
+    log.info('parse xmltv file %s' % xmltv_file)
+    parser = XmltvParser()
+    parser.add_channel = epg.add_channel
+    parser.add_program = epg.add_program
+    parser.parse(xmltv_file)
 
-    yield True
+    epg.add_program_wait()
+    return True

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
Freevo-cvslog mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/freevo-cvslog

Reply via email to