On Mon, Feb 22, 2010 at 10:46 PM, Stefan Behnel <stefan...@behnel.de> wrote:
> sharifah ummu kulthum, 22.02.2010 14:24:
> >   File "grabmy.py", line 63, in get_html
> >     return BeautifulSoup(content)
> >   File "build/bdist.linux-i686/egg/BeautifulSoup.py", line 1499, in __init__
> >   File "build/bdist.linux-i686/egg/BeautifulSoup.py", line 1230, in __init__
> >   File "build/bdist.linux-i686/egg/BeautifulSoup.py", line 1263, in _feed
> >   File "/usr/lib/python2.6/HTMLParser.py", line 108, in feed
> >     self.goahead(0)
> >   File "/usr/lib/python2.6/HTMLParser.py", line 148, in goahead
> >     k = self.parse_starttag(i)
> >   File "/usr/lib/python2.6/HTMLParser.py", line 226, in parse_starttag
> >     endpos = self.check_for_whole_start_tag(i)
> >   File "/usr/lib/python2.6/HTMLParser.py", line 301, in check_for_whole_start_tag
> >     self.error("malformed start tag")
> >   File "/usr/lib/python2.6/HTMLParser.py", line 115, in error
> >     raise HTMLParseError(message, self.getpos())
> > HTMLParser.HTMLParseError: malformed start tag, at line 830, column 36
>
> Just noticed this now - you seem to be using BeautifulSoup, likely version
> 3.1. This version does not support parsing broken HTML at all well, so use
> version 3.0.8 instead, or switch to the tools I indicated.
>
> Note that switching tools means that you need to change your code to use
> them. Just installing them is not enough.
>
> Stefan
>

I am so sorry, but I really don't know how to change the code, as I have only just started learning Python. How do I switch the version, or change the code? I don't really understand the code. Here is the code:

'''
Copyright (c) 2008 Yap Sok Ann <sa...@sayap.com>

This module contains xmltv grabbers for Malaysia channels.
''' __author__ = 'Yap Sok Ann <sa...@sayap.com>' __license__ = 'PSF License' import logging from datetime import date as dt from datetime import datetime, time, timedelta from dateutil.tz import tzlocal from httplib2 import Http from lxml import etree from urllib import urlencode from BeautifulSoup import BeautifulSoup channels = ['rtm1', 'rtm2', 'tv3', 'ntv7', '8tv', 'tv9'] datetime_format = '%Y%m%d%H%M%S %z' h = Http() h.force_exception_to_status_code = True #h.timeout = 15 logging.basicConfig( level=logging.DEBUG, format='%(asctime)s %(levelname)-8s %(process)d %(message)s', ) log = logging.getLogger(__name__) def strclean(s): s = s.strip().replace('‘', '\'').replace('’', '\'') if s != ' ': return s class Grabber(object): base_url = None def __init__(self, channel): self.channel = channel self.url = self.base_url def qs_params(self, date, **kwargs): '''Returns a dict of params to form the url's query string ''' raise NotImplementedError def _parse_html(self, date, html): '''Returns a list of dicts with the following keys: - mandatory: title, start - optional: stop, sub_title, desc, episode_number, episode_system ''' raise NotImplementedError def get_html(self, date, **kwargs): params = self.qs_params(date, **kwargs) response, content = h.request(self.url + '?' 
+ urlencode(params)) if response.status == 200: return BeautifulSoup(content) else: log.error('Status: %s\nContent: %s' % (response.status, content)) def parse_html(self, date, html): prev_schedule = None try: for schedule in self._parse_html(date, html): if 'stop' in schedule: yield schedule elif prev_schedule: prev_schedule['stop'] = schedule['start'] yield prev_schedule prev_schedule = schedule except: log.exception('Cannot parse html for date %s' % date) def to_xml(self, schedules): for schedule in schedules: program = etree.Element('programme', channel=self.channel, start=schedule['start'].strftime(datetime_format), stop=schedule['stop'].strftime(datetime_format)) title = etree.SubElement(program, 'title') title.text = schedule['title'] if schedule.get('episode_num'): episode_num = etree.SubElement(program, 'episode-num') episode_num.set('system', schedule.get('episode_system')) episode_num.text = schedule['episode_num'] for field in ['sub_title', 'desc']: if schedule.get(field): elem = etree.SubElement(program, field.replace('_', '-')) elem.text = schedule[field] yield program def grab(self, date, **kwargs): html = self.get_html(date, **kwargs) if html: return self.to_xml(self.parse_html(date, html)) class Astro(Grabber): base_url = 'http://www.astro.com.my/channels/%(channel)s/Default.asp' params_dicts = [dict(batch=1), dict(batch=2)] ignores = ['No Transmission', 'Transmission Ends'] def __init__(self, channel): self.channel = channel self.url = self.base_url % dict(channel=channel) def qs_params(self, date, **kwargs): kwargs['sDate'] = date.strftime('%d-%b-%Y') return kwargs def _parse_html(self, date, html): header_row = html.find('tr', bgcolor='#29487F') for tr in header_row.fetchNextSiblings('tr'): tds = tr.findChildren('td') title = strclean(tds[1].find('a').string) if title in self.ignores: continue # start time, '21:00' -> 9 PM hour, minute = [int(x) for x in tds[0].string.split(':')] start = datetime.combine(date, time(hour, minute, 
tzinfo=tzlocal())) # duration, '00:30' -> 30 minutes hours, minutes = [int(x) for x in tds[2].string.split(':')] stop = start + timedelta(hours=hours, minutes=minutes) yield dict(title=title, start=start, stop=stop) class TheStar(Grabber): base_url = 'http://star-ecentral.com/tvnradio/tvguide/guide.asp' params_dicts = [dict(db='live')] def qs_params(self, date, **kwargs): kwargs['pdate'] = date.strftime('%m/%d/%Y') kwargs['chn'] = self.channel.replace('rtm', 'tv') return kwargs def _parse_html(self, date, html): last_ampm = None header_row = html.find('tr', bgcolor='#5e789c') for tr in header_row.fetchNextSiblings('tr'): tds = tr.findChildren('td') schedule = {} schedule['title'] = strclean(tds[1].find('b').find('font').string) schedule['desc'] = strclean(tds[2].find('font').string) episode_num = strclean(tds[3].find('font').string) if episode_num: try: episode_num = int(episode_num) - 1 episode_num = '.' + str(episode_num) + '.' episode_system = 'xmltv_ns' except ValueError: episode_system = 'onscreen' schedule['episode_num'] = episode_num schedule['episode_system'] = episode_system # start time, '9.00pm' -> 9 PM time_str = tds[0].find('font').string ampm = time_str[-2:] hour, minute = [int(x) for x in time_str[:-2].split('.')] if ampm == 'pm' and hour < 12: hour += 12 elif ampm =='am' and hour == 12: hour = 0 if last_ampm == 'pm' and ampm == 'am': date = date + timedelta(1) schedule['start'] = datetime.combine( date, time(hour, minute, tzinfo=tzlocal())) last_ampm = ampm yield schedule def main(): from optparse import OptionParser parser = OptionParser() parser.add_option('-s', '--source', dest='source', help='SOURCE to grab from: Astro, TheStar. Default: TheStar') parser.add_option('-d', '--date', dest='date', help='Start DATE to grab schedules for (YYYY-MM-DD). Default: today') parser.add_option('-n', '--days', dest='days', help='Number of DAYS to grab schedules for. 
Default: 1') parser.add_option('-f', '--file', dest='filename', metavar='FILE', help='Output FILE to write to. Default: stdout') options, args = parser.parse_args() if options.source is None: cls = TheStar else: cls = globals()[options.source] if options.date is None: date = dt.today() else: date = dt(*[int(x) for x in options.date.split('-')]) if options.days is None: days = 1 else: days = int(options.days) root = etree.Element('tv') for channel in channels: grabber = cls(channel) for i in range(days): for params_dict in cls.params_dicts: for elem in grabber.grab(date + timedelta(i), **params_dict): root.append(elem) xml = etree.tostring(root, encoding='UTF-8', xml_declaration=True, pretty_print=True) if options.filename is None: print xml else: open(options.filename, 'w').write(xml) if __name__ == '__main__': main()
_______________________________________________ XML-SIG maillist - XML-SIG@python.org http://mail.python.org/mailman/listinfo/xml-sig