Author: duncan
Date: Sun Nov  5 18:32:04 2006
New Revision: 8539

Modified:
   branches/rel-1/freevo/ChangeLog
   branches/rel-1/freevo/setup.py
   branches/rel-1/freevo/src/util/fxdimdb.py

Log:
[ 1590928 ] Remove regex parsing of html for imdb
First attempt, seems to work for a small sample


Modified: branches/rel-1/freevo/ChangeLog
==============================================================================
--- branches/rel-1/freevo/ChangeLog     (original)
+++ branches/rel-1/freevo/ChangeLog     Sun Nov  5 18:32:04 2006
@@ -21,12 +21,14 @@
  * Replaced mmpython with kaa.base and kaa.metadata (F#1580712)
  * Updated DEBUGGING, IP, PORT, UID and GUI based on helper name (F#1580628)
  * Updated helper convert_config (F#1578183)
+ * Updated imdb movie parsing to use BeautifulSoup (F#1590928)
 
 
 Release 1.6.1 (2006-??-??):
 ---------------------------
 
  * Updated translations for new plug-ins and skins (F#1587980)
+ * Fixed cache problem when a play list file is corrupt (B#1589913)
  * Fixed detached music player when the plug-in mplayervis is enabled (B#1582048)
  * Fixed IMDB parser, the title should now work (B#1586579)
  * Fixed VIDEO_INTERLACING = False having no effect (B#1586721)

Modified: branches/rel-1/freevo/setup.py
==============================================================================
--- branches/rel-1/freevo/setup.py      (original)
+++ branches/rel-1/freevo/setup.py      Sun Nov  5 18:32:04 2006
@@ -20,6 +20,7 @@
             ('xml.utils.qp_xml', 'http://pyxml.sourceforge.net/'),
             ('kaa', 'http://sourceforge.net/projects/kaa' ),
             ('kaa.metadata', 'http://sourceforge.net/projects/kaa' ),
+            ('BeautifulSoup', 'http://www.crummy.com/software/BeautifulSoup/' ),
             ('pygame', 'http://www.pygame.org'),
             ('Image', 'http://www.pythonware.com/products/pil/'),
             ('elementtree', 'http://effbot.org/zone/elementtree.htm'),

Modified: branches/rel-1/freevo/src/util/fxdimdb.py
==============================================================================
--- branches/rel-1/freevo/src/util/fxdimdb.py   (original)
+++ branches/rel-1/freevo/src/util/fxdimdb.py   Sun Nov  5 18:32:04 2006
@@ -47,6 +47,7 @@
 import sys
 import codecs
 import os
+from BeautifulSoup import BeautifulSoup
 
 import config 
 import util
@@ -54,7 +55,7 @@
 from kaa.metadata.disc.discinfo import cdrom_disc_id
 #Constants
 
-freevo_version = '1.3.4'
+freevo_version = '1.6.0'
 
 imdb_title_list = '/tmp/imdb-movies.list'
 imdb_title_list_url = 'ftp://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/movies.list.gz'
@@ -302,21 +303,23 @@
     
     def writeFxd(self):
         """Write fxd file"""
+        print 'in writeFxd'
         #if fxdfile is empty, set it yourself
         if not self.fxdfile:
             self.setFxdFile()
         
         try:
             #should we add to an existing file?
-            if self.append == True :
-                if self.isdiscset == True:
+            if self.append:
+                if self.isdiscset:
                     self.update_discset()
-                else: self.update_movie()
+                else:
+                    self.update_movie()
             else:
                 #fetch images
                 self.fetch_image()
                 #should we write a disc-set ?
-                if self.isdiscset == True:
+                if self.isdiscset:
                     self.write_discset()
                 else:
                     self.write_movie()
@@ -412,7 +415,7 @@
                 "    The information in this file are from the Internet " +
                 "Movie Database (IMDb).\n" +
                 "    Please visit http://www.imdb.com for more informations.\n")
-        i.write("    <source url=\"http://www.imdb.com/Title?%s\"/>\n"  % self.imdb_id +
+        i.write("    <source url=\"http://www.imdb.com/tt%s\"/>\n"  % self.imdb_id +
                 "  </copyright>\n")
         #disc-set    
         i.write("  <disc-set title=\"%s\">\n" % self.str2XML(self.title))
@@ -597,114 +600,64 @@
         util.touch(os.path.join(config.FREEVO_CACHEDIR, 'freevo-rebuild-database'))
 
 
-
-    
     def parsedata(self, results, id=0):
         """results (imdb html page), imdb_id
         Returns tuple of (title, info(dict), image_urls)"""
 
         dvd = 0
 
-        # This is split across two lines, as the code is regexp and should be an XML parser
-        # this has made it far more difficult to repair. using the page title _SHOULD_ work
-        #   - Karl Lattimer
-        #regexp_title   = re.compile('.*STRONG CLASS="title">(.*?)<', re.I)
-        regexp_title   = re.compile('<title>(.*?) \(.*\)</title>', re.I)
-        regexp_year    = re.compile('.*<A HREF="/Sections/Years/.*?([0-9]*)<', re.I)
-        regexp_genre   = re.compile('.*href="/Sections/Genres(.*)$', re.I)
-        regexp_tagline = re.compile('.*<B CLASS="ch">Tagline.*?</B>(.*?)<', re.I)
-        regexp_plot1   = re.compile('.*<B CLASS="ch">Plot Outline.*?</B>(.*?)<', re.I)
-        regexp_plot2   = re.compile('.*<B CLASS="ch">Plot Summary.*?</B>(.*?)<', re.I)
-        regexp_rating  = re.compile('.*<B>([0-9\.]*)/10</B> (.[0-9,]* votes.?)', re.I)
-        regexp_image   = re.compile('.*ALT="cover".*src="(http://.*?)"', re.I)
-        regexp_runtime = re.compile('.*<b class="ch">Runtime', re.I)
-        regexp_dvd     = re.compile('.*<a href="/DVD\?', re.I)
-    
-        regexp_dvd_image = re.compile('.*(http://images.amazon.com.*?ZZZZZ.*?)"')
-        regexp_url   = re.compile('.*href="(http.*?)"', re.I)
-        regexp_url   = re.compile('.*href="(http.*?)"', re.I)
-    
-        next_line_is = None
-    
-        for line in results.read().split("\n"):
-            if next_line_is == 'runtime':
-                next_line_is = None
-                self.info['runtime'] = self.str2XML(line)
-    
-            if regexp_runtime.match(line):
-                next_line_is = 'runtime'
-                continue
-    
-            m = regexp_title.match(line)
-            if m: self.title = self.str2XML(m.group(1))
-    
-            m = regexp_year.match(line)
-            if m: self.info['year'] = m.group(1)
-    
-            m = regexp_genre.match(line)
-            if m:
-                for g in re.compile(' *</A>.*?> *', re.I).split(' </a>'+line+' > '):
-                    if self.info['genre'] == "": self.info['genre'] = g
-                    elif g != "" and g != "(more)": self.info['genre'] += " / "+ g
-    
-    
-            m = regexp_tagline.match('%s<' % line)
-            if m:
-                self.info['tagline'] = self.str2XML(re.compile('[\t ]+').sub(" ", ' ' + m.group(1))[1:])
-    
-            m = regexp_plot1.match('%s<' % line)
-            if m: self.info['plot'] = self.str2XML(re.compile('[\t ]+').sub(" ", ' ' + m.group(1))[1:])
-    
-            m = regexp_plot2.match('%s<' % line)
-            if m: self.info['plot'] = self.str2XML(re.compile('[\t ]+').sub(" ", ' ' + m.group(1))[1:])
-    
-            m = regexp_rating.match(line)
-            if m: self.info['rating'] = m.group(1) + '/10 ' + m.group(2)
-    
-            m = regexp_dvd.match(line)
-            if m: dvd = 1
-    
-            m = regexp_image.match(line)
-            if m: self.image_urls += [ m.group(1) ]
-    
+        soup = BeautifulSoup()
+        soup.feed(results.read())
     
+        title = soup.find('strong', {'class':'title'})
+        image = soup.find('img', { 'title':title.next.strip() })
+
+        self.title = title.next.strip()
+        self.info['title'] = self.title
+        self.info['image'] = image['src']
+        self.info['year'] = title.find('a').string.strip()
+        self.info['plot'] = soup.find(text='Plot Outline:').next.strip()
+        self.info['tagline'] = soup.find(text='Tagline:').next.strip()
+        self.info['genre'] = ''
+        genre=soup.find(text='Genre:').parent
+        genres = []
+        while genre.findNextSibling('a').string != '(more)':
+            genres.append(genre.findNextSibling('a').string.strip())
+            genre=genre.findNextSibling('a')
+        self.info['genre'] = genres[0]
+        for i in genres[1:]:
+            self.info['genre'] += ' / ' + i
+        rating = soup.find(text='User Rating:').parent.findNextSibling('b')
+        votes = rating.next.next.strip()
+        self.info['rating'] = rating.string.strip() + ' ' + votes.strip()
+        self.info['runtime'] = soup.find(text='Runtime:').next.strip()
+        for (k,v) in self.info.items():
+            print k, ':', v
+
+        print 'id:', id, 'dvd:', dvd
+
         if not id:
             return (self.title, self.info, self.image_urls)
     
-    
-        if dvd:
-            url = 'http://us.imdb.com/DVD?%s' % id
+        if not dvd:
+            url = 'http://us.imdb.com/title/tt%s/dvd' % id
+            print 'url:', url
             req = urllib2.Request(url, txdata, txheaders)
             
             try:
                 r = urllib2.urlopen(req)
-                for line in r.read().split("\n"):
-                    m = regexp_dvd_image.match(line)
-                    if m: self.image_urls += [ m.group(1) ]
+                soup.feed(r.read())
                 r.close()
+                divs = soup.findAll('table', { 'class' : 'dvd_section' })
+                for div in divs:
+                    image = div.find('img')
+                    if image['src'].find('http') < 0:
+                        continue
+                    self.image_urls += [ image['src'] ]
             except urllib2.HTTPError, error:
                 pass
+        print self.image_urls
     
-        #oldcode
-        #if not self.image_url_handler:
-        #    return #(title, info, image_urls)
-    
-        url = 'http://us.imdb.com/title/tt%s/posters' % id
-        req = urllib2.Request(url, txdata, txheaders)
-        try:
-            r = urllib2.urlopen(req)
-        except urllib2.HTTPError, error:
-            print error
-            return (self.title, self.info, self.image_urls)
-
-        data = r.read().replace('</a>', '\n').replace('</A>', '\n')
-        for line in data.split('\n'):
-            m = regexp_url.match(line)
-            if m:
-                url = urlparse.urlsplit(m.group(1))
-                if url[0] == 'http' and self.image_url_handler.has_key(url[1]):
-                    self.image_urls += self.image_url_handler[url[1]](url[1], url[2])
-        
-        r.close()
         return (self.title, self.info, self.image_urls)
     
     
@@ -719,13 +672,15 @@
     
     def fetch_image(self):
         """Fetch the best image"""
-        image_len = 0
+        print 'in fetch_image', self.image_urls
 
-        if (len(self.image_urls) == 0): # No images
+        image_len = 0
+        if len(self.image_urls) == 0: # No images
             return
 
         for image in self.image_urls:
             try:
+                print 'image:', image
                 # get sizes of images
                 req = urllib2.Request(image, txdata, txheaders)
                 r = urllib2.urlopen(req)

-------------------------------------------------------------------------
Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
_______________________________________________
Freevo-cvslog mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/freevo-cvslog

Reply via email to