Package: python-urlgrabber
Version: 3.1.0-4
Severity: normal
Please note that the CVS version is different from (and newer than) the
latest released version (3.1.0); it also contains various fixes and
improvements to its code.
As an example, I attach a diff of the file "urlgrabber/grabber.py".
-- System Information:
Debian Release: 5.0
APT prefers testing
APT policy: (500, 'testing')
Architecture: amd64 (x86_64)
Kernel: Linux 2.6.26-1-amd64 (SMP w/2 CPU cores)
Locale: LANG=es_AR.UTF-8, LC_CTYPE=es_AR.UTF-8 (charmap=UTF-8)
Shell: /bin/sh linked to /bin/bash
Versions of packages python-urlgrabber depends on:
ii python-support 0.8.7 automated rebuilding support for P
ii python2.5 2.5.4-1 An interactive high-level object-o
python-urlgrabber recommends no packages.
python-urlgrabber suggests no packages.
-- no debconf information
--
JID: [email protected] | http://lusers.com.ar/
2B82 A38D 1BA5 847A A74D 6C34 6AB7 9ED6 C8FD F9C1
--- /tmp/urlgrabber-3.1.0/urlgrabber/grabber.py 2006-09-21 21:58:05.000000000 -0300
+++ /tmp/urlgrabber-cvs/urlgrabber/grabber.py 2006-12-12 16:08:46.000000000 -0300
@@ -55,8 +55,9 @@
text = None
- specifies an alternativ text item in the beginning of the progress
- bar line. If not given, the basename of the file is used.
+ specifies alternative text to be passed to the progress meter
+ object. If not given, the default progress meter will use the
+ basename of the file.
throttle = 1.0
@@ -167,6 +168,13 @@
chain integrity. You are responsible for ensuring that any
extension handlers are present if said features are required.
+ cache_openers = True
+
+ controls whether urllib2 openers should be cached and reused, or
+ whether they should be created each time. There's a modest
+ overhead in recreating them, but it's slightly safer to do so if
+ you're modifying the handlers between calls.
+
data = None
Only relevant for the HTTP family (and ignored for other
@@ -179,6 +187,44 @@
badly and if you do not use the proper case (shown here), your
values will be overridden with the defaults.
+ urlparser = URLParser()
+
+ The URLParser class handles pre-processing of URLs, including
+ auth-handling for user/pass encoded in http urls, file handing
+ (that is, filenames not sent as a URL), and URL quoting. If you
+ want to override any of this behavior, you can pass in a
+ replacement instance. See also the 'quote' option.
+
+ quote = None
+
+ Whether or not to quote the path portion of a url.
+ quote = 1 -> quote the URLs (they're not quoted yet)
+ quote = 0 -> do not quote them (they're already quoted)
+ quote = None -> guess what to do
+
+ This option only affects proper urls like 'file:///etc/passwd'; it
+ does not affect 'raw' filenames like '/etc/passwd'. The latter
+ will always be quoted as they are converted to URLs. Also, only
+ the path part of a url is quoted. If you need more fine-grained
+ control, you should probably subclass URLParser and pass it in via
+ the 'urlparser' option.
+
+ ssl_ca_cert = None
+
+ this option can be used if M2Crypto is available and will be
+ ignored otherwise. If provided, it will be used to create an SSL
+ context. If both ssl_ca_cert and ssl_context are provided, then
+ ssl_context will be ignored and a new context will be created from
+ ssl_ca_cert.
+
+ ssl_context = None
+
+ this option can be used if M2Crypto is available and will be
+ ignored otherwise. If provided, this SSL context will be used.
+ If both ssl_ca_cert and ssl_context are provided, then ssl_context
+ will be ignored and a new context will be created from
+ ssl_ca_cert.
+
RETRY RELATED ARGUMENTS
@@ -283,28 +329,6 @@
passed the same arguments, so you could use the same function for
both.
- urlparser = URLParser()
-
- The URLParser class handles pre-processing of URLs, including
- auth-handling for user/pass encoded in http urls, file handing
- (that is, filenames not sent as a URL), and URL quoting. If you
- want to override any of this behavior, you can pass in a
- replacement instance. See also the 'quote' option.
-
- quote = None
-
- Whether or not to quote the path portion of a url.
- quote = 1 -> quote the URLs (they're not quoted yet)
- quote = 0 -> do not quote them (they're already quoted)
- quote = None -> guess what to do
-
- This option only affects proper urls like 'file:///etc/passwd'; it
- does not affect 'raw' filenames like '/etc/passwd'. The latter
- will always be quoted as they are converted to URLs. Also, only
- the path part of a url is quoted. If you need more fine-grained
- control, you should probably subclass URLParser and pass it in via
- the 'urlparser' option.
-
BANDWIDTH THROTTLING
urlgrabber supports throttling via two values: throttle and
@@ -364,7 +388,7 @@
"""
-# $Id: grabber.py,v 1.48 2006/09/22 00:58:05 mstenner Exp $
+# $Id: grabber.py,v 1.52 2006/12/12 19:08:46 mstenner Exp $
import os
import os.path
@@ -375,6 +399,7 @@
import string
import urllib
import urllib2
+import thread
from stat import * # S_* and ST_*
########################################################################
@@ -406,8 +431,10 @@
import keepalive
from keepalive import HTTPHandler, HTTPSHandler
have_keepalive = True
+ keepalive_http_handler = HTTPHandler()
except ImportError, msg:
have_keepalive = False
+ keepalive_http_handler = None
try:
# add in range support conditionally too
@@ -463,7 +490,7 @@
if sslfactory.DEBUG is None:
sslfactory.DEBUG = DBOBJ
-def _init_default_logger():
+def _init_default_logger(logspec=None):
'''Examines the environment variable URLGRABBER_DEBUG and creates
a logging object (logging.logger) based on the contents. It takes
the form
@@ -489,9 +516,12 @@
collect the code into a nice block.'''
try:
- dbinfo = os.environ['URLGRABBER_DEBUG'].split(',')
+ if logspec is None:
+ logspec = os.environ['URLGRABBER_DEBUG']
+ dbinfo = logspec.split(',')
import logging
- level = logging._levelNames.get(dbinfo[0], int(dbinfo[0]))
+ level = logging._levelNames.get(dbinfo[0], None)
+ if level is None: level = int(dbinfo[0])
if level < 1: raise ValueError()
formatter = logging.Formatter('%(asctime)s %(message)s')
@@ -508,7 +538,17 @@
DBOBJ = None
set_logger(DBOBJ)
+def _log_package_state():
+ if not DEBUG: return
+ DEBUG.info('urlgrabber version = %s' % __version__)
+ DEBUG.info('have_m2crypto = %s' % sslfactory.have_m2crypto)
+ DEBUG.info('trans function "_" = %s' % _)
+ DEBUG.info('have_keepalive = %s' % have_keepalive)
+ DEBUG.info('have_range = %s' % have_range)
+ DEBUG.info('have_socket_timeout = %s' % have_socket_timeout)
+
_init_default_logger()
+_log_package_state()
########################################################################
# END MODULE INITIALIZATION
########################################################################
@@ -536,6 +576,7 @@
13 - malformed proxy url
14 - HTTPError (includes .code and .exception attributes)
15 - user abort
+ 16 - error writing to local file
MirrorGroup error codes (256 -- 511)
256 - No more mirrors left to try
@@ -811,6 +852,24 @@
self.ssl_ca_cert = None
self.ssl_context = None
+ def __repr__(self):
+ return self.format()
+
+ def format(self, indent=' '):
+ keys = self.__dict__.keys()
+ if self.delegate is not None:
+ keys.remove('delegate')
+ keys.sort()
+ s = '{\n'
+ for k in keys:
+ s = s + indent + '%-15s: %s,\n' % \
+ (repr(k), repr(self.__dict__[k]))
+ if self.delegate:
+ df = self.delegate.format(indent + ' ')
+ s = s + indent + '%-15s: %s\n' % ("'delegate'", df)
+ s = s + indent + '}'
+ return s
+
class URLGrabber:
"""Provides easy opening of URLs with a variety of options.
@@ -878,6 +937,7 @@
like any other file object.
"""
opts = self.opts.derive(**kwargs)
+ if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
(url,parts) = opts.urlparser.parse(url, opts)
def retryfunc(opts, url):
return URLGrabberFileObject(url, filename=None, opts=opts)
@@ -890,6 +950,7 @@
different from the passed-in filename if copy_local == 0.
"""
opts = self.opts.derive(**kwargs)
+ if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
(url,parts) = opts.urlparser.parse(url, opts)
(scheme, host, path, parm, query, frag) = parts
if filename is None:
@@ -934,6 +995,7 @@
into memory, but don't use too much'
"""
opts = self.opts.derive(**kwargs)
+ if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
(url,parts) = opts.urlparser.parse(url, opts)
if limit is not None:
limit = limit + 1
@@ -1021,7 +1083,7 @@
# it _must_ come before all other handlers in the list or urllib2
# chokes.
if self.opts.proxies:
- handlers.append( CachedProxyHandler(self.opts.proxies) )
+ handlers.append( _proxy_handler_cache.get(self.opts.proxies) )
# -------------------------------------------------------
# OK, these next few lines are a serious kludge to get
@@ -1044,19 +1106,19 @@
handlers.append( urllib2.FTPHandler() )
# -------------------------------------------------------
- ssl_factory = sslfactory.get_factory(self.opts.ssl_ca_cert,
- self.opts.ssl_context)
+ ssl_factory = _ssl_factory_cache.get( (self.opts.ssl_ca_cert,
+ self.opts.ssl_context) )
if need_keepalive_handler:
- handlers.append(HTTPHandler())
- handlers.append(HTTPSHandler(ssl_factory))
+ handlers.append(keepalive_http_handler)
+ handlers.append(_https_handler_cache.get(ssl_factory))
if need_range_handler:
handlers.extend( range_handlers )
handlers.append( auth_handler )
if self.opts.cache_openers:
- self._opener = CachedOpenerDirector(ssl_factory, *handlers)
+ self._opener = _opener_cache.get([ssl_factory,] + handlers)
else:
- self._opener = ssl_factory.create_opener(*handlers)
+ self._opener = _opener_cache.create([ssl_factory,] + handlers)
# OK, I don't like to do this, but otherwise, we end up with
# TWO user-agent headers.
self._opener.addheaders = []
@@ -1196,15 +1258,35 @@
def _do_grab(self):
"""dump the file to self.filename."""
- if self.append: new_fo = open(self.filename, 'ab')
- else: new_fo = open(self.filename, 'wb')
+ if self.append: mode = 'ab'
+ else: mode = 'wb'
+ if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
+ (self.filename, mode))
+ try:
+ new_fo = open(self.filename, mode)
+ except IOError, e:
+ raise URLGrabError(16, _(\
+ 'error opening local file, IOError: %s') % (e, ))
+
+ try:
+ # if we have a known range, only try to read that much.
+ (low, high) = self.opts.range
+ amount = high - low
+ except TypeError, ValueError:
+ amount = None
bs = 1024*8
size = 0
+ if amount is not None: bs = min(bs, amount - size)
block = self.read(bs)
size = size + len(block)
while block:
- new_fo.write(block)
+ try:
+ new_fo.write(block)
+ except IOError, e:
+ raise URLGrabError(16, _(\
+ 'error writing to local file, IOError: %s') % (e, ))
+ if amount is not None: bs = min(bs, amount - size)
block = self.read(bs)
size = size + len(block)
@@ -1300,36 +1382,96 @@
try: self.fo.close_connection()
except: pass
-_handler_cache = []
-def CachedOpenerDirector(ssl_factory = None, *handlers):
- for (cached_handlers, opener) in _handler_cache:
- if cached_handlers == handlers:
- for handler in opener.handlers:
- handler.add_parent(opener)
- return opener
- if not ssl_factory:
- ssl_factory = sslfactory.get_factory()
- opener = ssl_factory.create_opener(*handlers)
- _handler_cache.append( (handlers, opener) )
- return opener
+#####################################################################
-_proxy_cache = []
-def CachedProxyHandler(proxies):
- for (pdict, handler) in _proxy_cache:
- if pdict == proxies:
- if DEBUG: DEBUG.debug('re-using proxy settings: %s', proxies)
- break
- else:
+class NoDefault: pass
+class ObjectCache:
+ def __init__(self, name=None):
+ self.name = name or self.__class__.__name__
+ self._lock = thread.allocate_lock()
+ self._cache = []
+
+ def lock(self):
+ self._lock.acquire()
+
+ def unlock(self):
+ self._lock.release()
+
+ def get(self, key, create=None, found=None):
+ for (k, v) in self._cache:
+ if k == key:
+ if DEBUG:
+ DEBUG.debug('%s: found key' % self.name)
+ DEBUG.debug('%s: key = %s' % (self.name, key))
+ DEBUG.debug('%s: val = %s' % (self.name, v))
+ found = found or getattr(self, 'found', None)
+ if found: v = found(key, v)
+ return v
+ if DEBUG:
+ DEBUG.debug('%s: no key found' % self.name)
+ DEBUG.debug('%s: key = %s' % (self.name, key))
+ create = create or getattr(self, 'create', None)
+ if create:
+ value = create(key)
+ if DEBUG:
+ DEBUG.info('%s: new value created' % self.name)
+ DEBUG.debug('%s: val = %s' % (self.name, value))
+ self._cache.append( (key, value) )
+ return value
+ else:
+ raise KeyError('key not found: %s' % key)
+
+ def set(self, key, value):
+ if DEBUG:
+ DEBUG.info('%s: inserting key' % self.name)
+ DEBUG.debug('%s: key = %s' % (self.name, key))
+ DEBUG.debug('%s: val = %s' % (self.name, value))
+ self._cache.append( (key, value) )
+
+ def ts_get(self, key, create=None, found=None):
+ self._lock.acquire()
+ try:
+ self.get(key, create, found)
+ finally:
+ self._lock.release()
+
+ def ts_set(self, key, value):
+ self._lock.acquire()
+ try:
+ self.set(key, value)
+ finally:
+ self._lock.release()
+
+class OpenerCache(ObjectCache):
+ def found(self, factory_and_handlers, opener):
+ for handler in factory_and_handlers[1:]:
+ handler.add_parent(opener)
+ return opener
+ def create(self, factory_and_handlers):
+ factory = factory_and_handlers[0]
+ handlers = factory_and_handlers[1:]
+ return factory.create_opener(*handlers)
+_opener_cache = OpenerCache()
+
+class ProxyHandlerCache(ObjectCache):
+ def create(self, proxies):
for k, v in proxies.items():
utype, url = urllib.splittype(v)
host, other = urllib.splithost(url)
if (utype is None) or (host is None):
raise URLGrabError(13, _('Bad proxy URL: %s') % v)
+ return urllib2.ProxyHandler(proxies)
+_proxy_handler_cache = ProxyHandlerCache()
- if DEBUG: DEBUG.info('creating new proxy handler: %s', proxies)
- handler = urllib2.ProxyHandler(proxies)
- _proxy_cache.append( (proxies, handler) )
- return handler
+class HTTPSHandlerCache(ObjectCache):
+ def create(self, ssl_factory):
+ return HTTPSHandler(ssl_factory)
+_https_handler_cache = HTTPSHandlerCache()
+
+class SSLFactoryCache(ObjectCache):
+ def create(self, cert_and_context):
+ return sslfactory.get_factory(*cert_and_context)
+_ssl_factory_cache = SSLFactoryCache()
#####################################################################
# DEPRECATED FUNCTIONS