Package: python-urlgrabber
Version: 3.1.0-4
Severity: normal
Please note that the CVS version is different from (and newer than) the
latest released version (3.1.0); it also contains various fixes and
improvements to its code.
As an example, I attach a diff of the file "urlgrabber/grabber.py".
-- System Information:
Debian Release: 5.0
APT prefers testing
APT policy: (500, 'testing')
Architecture: amd64 (x86_64)
Kernel: Linux 2.6.26-1-amd64 (SMP w/2 CPU cores)
Locale: LANG=es_AR.UTF-8, LC_CTYPE=es_AR.UTF-8 (charmap=UTF-8)
Shell: /bin/sh linked to /bin/bash
Versions of packages python-urlgrabber depends on:
ii python-support 0.8.7 automated rebuilding support for P
ii python2.5 2.5.4-1 An interactive high-level object-o
python-urlgrabber recommends no packages.
python-urlgrabber suggests no packages.
-- no debconf information
--
JID: [email protected] | http://lusers.com.ar/
2B82 A38D 1BA5 847A A74D 6C34 6AB7 9ED6 C8FD F9C1
--- /tmp/urlgrabber-3.1.0/urlgrabber/grabber.py 2006-09-21 21:58:05.000000000 -0300
+++ /tmp/urlgrabber-cvs/urlgrabber/grabber.py 2006-12-12 16:08:46.000000000 -0300
@@ -55,8 +55,9 @@
text = None
- specifies an alternativ text item in the beginning of the progress
- bar line. If not given, the basename of the file is used.
+ specifies alternative text to be passed to the progress meter
+ object. If not given, the default progress meter will use the
+ basename of the file.
throttle = 1.0
@@ -167,6 +168,13 @@
chain integrity. You are responsible for ensuring that any
extension handlers are present if said features are required.
+ cache_openers = True
+
+ controls whether urllib2 openers should be cached and reused, or
+ whether they should be created each time. There's a modest
+ overhead in recreating them, but it's slightly safer to do so if
+ you're modifying the handlers between calls.
+
data = None
Only relevant for the HTTP family (and ignored for other
@@ -179,6 +187,44 @@
badly and if you do not use the proper case (shown here), your
values will be overridden with the defaults.
+ urlparser = URLParser()
+
+ The URLParser class handles pre-processing of URLs, including
+ auth-handling for user/pass encoded in http urls, file handing
+ (that is, filenames not sent as a URL), and URL quoting. If you
+ want to override any of this behavior, you can pass in a
+ replacement instance. See also the 'quote' option.
+
+ quote = None
+
+ Whether or not to quote the path portion of a url.
+ quote = 1 -> quote the URLs (they're not quoted yet)
+ quote = 0 -> do not quote them (they're already quoted)
+ quote = None -> guess what to do
+
+ This option only affects proper urls like 'file:///etc/passwd'; it
+ does not affect 'raw' filenames like '/etc/passwd'. The latter
+ will always be quoted as they are converted to URLs. Also, only
+ the path part of a url is quoted. If you need more fine-grained
+ control, you should probably subclass URLParser and pass it in via
+ the 'urlparser' option.
+
+ ssl_ca_cert = None
+
+ this option can be used if M2Crypto is available and will be
+ ignored otherwise. If provided, it will be used to create an SSL
+ context. If both ssl_ca_cert and ssl_context are provided, then
+ ssl_context will be ignored and a new context will be created from
+ ssl_ca_cert.
+
+ ssl_context = None
+
+ this option can be used if M2Crypto is available and will be
+ ignored otherwise. If provided, this SSL context will be used.
+ If both ssl_ca_cert and ssl_context are provided, then ssl_context
+ will be ignored and a new context will be created from
+ ssl_ca_cert.
+
RETRY RELATED ARGUMENTS
@@ -283,28 +329,6 @@
passed the same arguments, so you could use the same function for
both.
- urlparser = URLParser()
-
- The URLParser class handles pre-processing of URLs, including
- auth-handling for user/pass encoded in http urls, file handing
- (that is, filenames not sent as a URL), and URL quoting. If you
- want to override any of this behavior, you can pass in a
- replacement instance. See also the 'quote' option.
-
- quote = None
-
- Whether or not to quote the path portion of a url.
- quote = 1 -> quote the URLs (they're not quoted yet)
- quote = 0 -> do not quote them (they're already quoted)
- quote = None -> guess what to do
-
- This option only affects proper urls like 'file:///etc/passwd'; it
- does not affect 'raw' filenames like '/etc/passwd'. The latter
- will always be quoted as they are converted to URLs. Also, only
- the path part of a url is quoted. If you need more fine-grained
- control, you should probably subclass URLParser and pass it in via
- the 'urlparser' option.
-
BANDWIDTH THROTTLING
urlgrabber supports throttling via two values: throttle and
@@ -364,7 +388,7 @@
"""
-# $Id: grabber.py,v 1.48 2006/09/22 00:58:05 mstenner Exp $
+# $Id: grabber.py,v 1.52 2006/12/12 19:08:46 mstenner Exp $
import os
import os.path
@@ -375,6 +399,7 @@
import string
import urllib
import urllib2
+import thread
from stat import * # S_* and ST_*
########################################################################
@@ -406,8 +431,10 @@
import keepalive
from keepalive import HTTPHandler, HTTPSHandler
have_keepalive = True
+ keepalive_http_handler = HTTPHandler()
except ImportError, msg:
have_keepalive = False
+ keepalive_http_handler = None
try:
# add in range support conditionally too
@@ -463,7 +490,7 @@
if sslfactory.DEBUG is None:
sslfactory.DEBUG = DBOBJ
-def _init_default_logger():
+def _init_default_logger(logspec=None):
'''Examines the environment variable URLGRABBER_DEBUG and creates
a logging object (logging.logger) based on the contents. It takes
the form
@@ -489,9 +516,12 @@
collect the code into a nice block.'''
try:
- dbinfo = os.environ['URLGRABBER_DEBUG'].split(',')
+ if logspec is None:
+ logspec = os.environ['URLGRABBER_DEBUG']
+ dbinfo = logspec.split(',')
import logging
- level = logging._levelNames.get(dbinfo[0], int(dbinfo[0]))
+ level = logging._levelNames.get(dbinfo[0], None)
+ if level is None: level = int(dbinfo[0])
if level < 1: raise ValueError()
formatter = logging.Formatter('%(asctime)s %(message)s')
@@ -508,7 +538,17 @@
DBOBJ = None
set_logger(DBOBJ)
+def _log_package_state():
+ if not DEBUG: return
+ DEBUG.info('urlgrabber version = %s' % __version__)
+ DEBUG.info('have_m2crypto = %s' % sslfactory.have_m2crypto)
+ DEBUG.info('trans function "_" = %s' % _)
+ DEBUG.info('have_keepalive = %s' % have_keepalive)
+ DEBUG.info('have_range = %s' % have_range)
+ DEBUG.info('have_socket_timeout = %s' % have_socket_timeout)
+
_init_default_logger()
+_log_package_state()
########################################################################
# END MODULE INITIALIZATION
########################################################################
@@ -536,6 +576,7 @@
13 - malformed proxy url
14 - HTTPError (includes .code and .exception attributes)
15 - user abort
+ 16 - error writing to local file
MirrorGroup error codes (256 -- 511)
256 - No more mirrors left to try
@@ -811,6 +852,24 @@
self.ssl_ca_cert = None
self.ssl_context = None
+ def __repr__(self):
+ return self.format()
+
+ def format(self, indent=' '):
+ keys = self.__dict__.keys()
+ if self.delegate is not None:
+ keys.remove('delegate')
+ keys.sort()
+ s = '{\n'
+ for k in keys:
+ s = s + indent + '%-15s: %s,\n' % \
+ (repr(k), repr(self.__dict__[k]))
+ if self.delegate:
+ df = self.delegate.format(indent + ' ')
+ s = s + indent + '%-15s: %s\n' % ("'delegate'", df)
+ s = s + indent + '}'
+ return s
+
class URLGrabber:
"""Provides easy opening of URLs with a variety of options.
@@ -878,6 +937,7 @@
like any other file object.
"""
opts = self.opts.derive(**kwargs)
+ if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
(url,parts) = opts.urlparser.parse(url, opts)
def retryfunc(opts, url):
return URLGrabberFileObject(url, filename=None, opts=opts)
@@ -890,6 +950,7 @@
different from the passed-in filename if copy_local == 0.
"""
opts = self.opts.derive(**kwargs)
+ if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
(url,parts) = opts.urlparser.parse(url, opts)
(scheme, host, path, parm, query, frag) = parts
if filename is None:
@@ -934,6 +995,7 @@
into memory, but don't use too much'
"""
opts = self.opts.derive(**kwargs)
+ if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
(url,parts) = opts.urlparser.parse(url, opts)
if limit is not None:
limit = limit + 1
@@ -1021,7 +1083,7 @@
# it _must_ come before all other handlers in the list or urllib2
# chokes.
if self.opts.proxies:
- handlers.append( CachedProxyHandler(self.opts.proxies) )
+ handlers.append( _proxy_handler_cache.get(self.opts.proxies) )
# -------------------------------------------------------
# OK, these next few lines are a serious kludge to get
@@ -1044,19 +1106,19 @@
handlers.append( urllib2.FTPHandler() )
# -------------------------------------------------------
- ssl_factory = sslfactory.get_factory(self.opts.ssl_ca_cert,
- self.opts.ssl_context)
+ ssl_factory = _ssl_factory_cache.get( (self.opts.ssl_ca_cert,
+ self.opts.ssl_context) )
if need_keepalive_handler:
- handlers.append(HTTPHandler())
- handlers.append(HTTPSHandler(ssl_factory))
+ handlers.append(keepalive_http_handler)
+ handlers.append(_https_handler_cache.get(ssl_factory))
if need_range_handler:
handlers.extend( range_handlers )
handlers.append( auth_handler )
if self.opts.cache_openers:
- self._opener = CachedOpenerDirector(ssl_factory, *handlers)
+ self._opener = _opener_cache.get([ssl_factory,] + handlers)
else:
- self._opener = ssl_factory.create_opener(*handlers)
+ self._opener = _opener_cache.create([ssl_factory,] + handlers)
# OK, I don't like to do this, but otherwise, we end up with
# TWO user-agent headers.
self._opener.addheaders = []
@@ -1196,15 +1258,35 @@
def _do_grab(self):
"""dump the file to self.filename."""
- if self.append: new_fo = open(self.filename, 'ab')
- else: new_fo = open(self.filename, 'wb')
+ if self.append: mode = 'ab'
+ else: mode = 'wb'
+ if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
+ (self.filename, mode))
+ try:
+ new_fo = open(self.filename, mode)
+ except IOError, e:
+ raise URLGrabError(16, _(\
+ 'error opening local file, IOError: %s') % (e, ))
+
+ try:
+ # if we have a known range, only try to read that much.
+ (low, high) = self.opts.range
+ amount = high - low
+ except TypeError, ValueError:
+ amount = None
bs = 1024*8
size = 0
+ if amount is not None: bs = min(bs, amount - size)
block = self.read(bs)
size = size + len(block)
while block:
- new_fo.write(block)
+ try:
+ new_fo.write(block)
+ except IOError, e:
+ raise URLGrabError(16, _(\
+ 'error writing to local file, IOError: %s') % (e, ))
+ if amount is not None: bs = min(bs, amount - size)
block = self.read(bs)
size = size + len(block)
@@ -1300,36 +1382,96 @@
try: self.fo.close_connection()
except: pass
-_handler_cache = []
-def CachedOpenerDirector(ssl_factory = None, *handlers):
- for (cached_handlers, opener) in _handler_cache:
- if cached_handlers == handlers:
- for handler in opener.handlers:
- handler.add_parent(opener)
- return opener
- if not ssl_factory:
- ssl_factory = sslfactory.get_factory()
- opener = ssl_factory.create_opener(*handlers)
- _handler_cache.append( (handlers, opener) )
- return opener
+#####################################################################
-_proxy_cache = []
-def CachedProxyHandler(proxies):
- for (pdict, handler) in _proxy_cache:
- if pdict == proxies:
- if DEBUG: DEBUG.debug('re-using proxy settings: %s', proxies)
- break
- else:
+class NoDefault: pass
+class ObjectCache:
+ def __init__(self, name=None):
+ self.name = name or self.__class__.__name__
+ self._lock = thread.allocate_lock()
+ self._cache = []
+
+ def lock(self):
+ self._lock.acquire()
+
+ def unlock(self):
+ self._lock.release()
+
+ def get(self, key, create=None, found=None):
+ for (k, v) in self._cache:
+ if k == key:
+ if DEBUG:
+ DEBUG.debug('%s: found key' % self.name)
+ DEBUG.debug('%s: key = %s' % (self.name, key))
+ DEBUG.debug('%s: val = %s' % (self.name, v))
+ found = found or getattr(self, 'found', None)
+ if found: v = found(key, v)
+ return v
+ if DEBUG:
+ DEBUG.debug('%s: no key found' % self.name)
+ DEBUG.debug('%s: key = %s' % (self.name, key))
+ create = create or getattr(self, 'create', None)
+ if create:
+ value = create(key)
+ if DEBUG:
+ DEBUG.info('%s: new value created' % self.name)
+ DEBUG.debug('%s: val = %s' % (self.name, value))
+ self._cache.append( (key, value) )
+ return value
+ else:
+ raise KeyError('key not found: %s' % key)
+
+ def set(self, key, value):
+ if DEBUG:
+ DEBUG.info('%s: inserting key' % self.name)
+ DEBUG.debug('%s: key = %s' % (self.name, key))
+ DEBUG.debug('%s: val = %s' % (self.name, value))
+ self._cache.append( (key, value) )
+
+ def ts_get(self, key, create=None, found=None):
+ self._lock.acquire()
+ try:
+ self.get(key, create, found)
+ finally:
+ self._lock.release()
+
+ def ts_set(self, key, value):
+ self._lock.acquire()
+ try:
+ self.set(key, value)
+ finally:
+ self._lock.release()
+
+class OpenerCache(ObjectCache):
+ def found(self, factory_and_handlers, opener):
+ for handler in factory_and_handlers[1:]:
+ handler.add_parent(opener)
+ return opener
+ def create(self, factory_and_handlers):
+ factory = factory_and_handlers[0]
+ handlers = factory_and_handlers[1:]
+ return factory.create_opener(*handlers)
+_opener_cache = OpenerCache()
+
+class ProxyHandlerCache(ObjectCache):
+ def create(self, proxies):
for k, v in proxies.items():
utype, url = urllib.splittype(v)
host, other = urllib.splithost(url)
if (utype is None) or (host is None):
raise URLGrabError(13, _('Bad proxy URL: %s') % v)
+ return urllib2.ProxyHandler(proxies)
+_proxy_handler_cache = ProxyHandlerCache()
- if DEBUG: DEBUG.info('creating new proxy handler: %s', proxies)
- handler = urllib2.ProxyHandler(proxies)
- _proxy_cache.append( (proxies, handler) )
- return handler
+class HTTPSHandlerCache(ObjectCache):
+ def create(self, ssl_factory):
+ return HTTPSHandler(ssl_factory)
+_https_handler_cache = HTTPSHandlerCache()
+
+class SSLFactoryCache(ObjectCache):
+ def create(self, cert_and_context):
+ return sslfactory.get_factory(*cert_and_context)
+_ssl_factory_cache = SSLFactoryCache()
#####################################################################
# DEPRECATED FUNCTIONS