Oops. This was sent to me some (long) time ago. Anybody want to handle it? I really don't have time anymore to look into the parser code unless it is to scratch an own itch...
Holger
---------- Forwarded message ---------- Date: Fri, 01 Aug 2003 18:00:36 -0500 From: Dario Morales Lopez <[EMAIL PROTECTED]> To: [EMAIL PROTECTED] X-Spambayes-Classification: ham; 0.00 Subject: Url.py replacing spaces in the url
Hi, I found your e-mail on the authors page of the Plucker user's guide. Since I don't want to subscribe to the mailing lists, I think you are the person who should get this.
Attached is a file with a small modification that replaces space chars with URL-encoded spaces (%20). Since I'm new to Plucker and to Python I don't really know whether this is the right place to do the replacing, or whether it is the best way to do it, but it's a quick hack and it works for me.
I've done this because I'm switching from AvantGo to Plucker and was adding my local cinema showtimes page to Plucker, but while parsing the first page Plucker exited with an awful invalid request error. I realized that the error occurred because the page contains "hrefs" with spaces in them. Since the page is intended to be used with AvantGo and not with Plucker, asking the web page maintainer to change their invalid URLs was out of the question.
I hope it's useful in some way, and thank you for your very good efforts bringing Plucker to the world.
-- Darío Morales López [EMAIL PROTECTED] ------------------- "UNIX was not designed to stop you from doing stupid things, because that would also stop you from doing clever things." - Doug Gwyn
#!/usr/bin/env python
"""
Url.py $Id: Url.py,v 1.17 2003/03/27 02:55:19 chrish Exp $
Utility class to encapsulate information about an URL and the useful
operations thereon.
Copyright 1999, 2000 by Holger Duerer <[EMAIL PROTECTED]>
Distributable under the GNU General Public License Version 2 or newer.
"""
import urlparse, urllib, string, sys, os
# Register the 'plucker' scheme in each of urlparse's per-feature scheme
# lists (relative, netloc, params, query, fragment) so that plucker: URLs
# are treated like the schemes already listed there.
urlparse.uses_relative.append ('plucker')
urlparse.uses_netloc.append ('plucker')
urlparse.uses_params.append ('plucker')
urlparse.uses_query.append ('plucker')
urlparse.uses_fragment.append ('plucker')
######################################################################
# Replacement for the urlparse lib, because this is buggy on Windows #
######################################################################
def windows_file_url_parse (url):
    """Split a Windows ``file:`` URL into a urlparse-style 6-tuple.

    The text after the last '#' becomes the fragment.  A leading
    'file://' and/or 'file:' prefix (matched case-insensitively) is
    removed from the remainder, and a leading drive letter ('c:') is
    upper-cased.

    Returns (protocol, host, path, params, query, fragment), where
    protocol is always 'file' and host, params and query are always
    empty strings.
    """
    prot = 'file'
    fragment = ''
    # Split on the LAST '#', so earlier '#' characters stay in the path.
    i = url.rfind ('#')
    if i >= 0:
        fragment = url[i+1:]
        url = url[:i]
    path = url
    # Both checks run in sequence, exactly as the prefixes are listed.
    if path[0:7].lower () == 'file://':
        path = path[7:]
    if path[0:5].lower () == 'file:':
        path = path[5:]
    # Normalise the drive letter: 'c:...' -> 'C:...'.  Slices are used
    # instead of indexing so that a short or empty path cannot raise.
    if ('A' <= path[0:1].upper () <= 'Z') and (path[1:2] == ':'):
        path = path[0:1].upper () + path[1:]
    host = ''
    params = ''
    query = ''
    return prot, host, path, params, query, fragment
######################################################################
# Replacement for the urlparse lib, because this is buggy on Windows #
######################################################################
def windows_file_urljoin(base, url):
    """Join `url` onto the ``file:`` URL `base` (urljoin replacement).

    `url` is returned unchanged when its first ':' sits at index 3..10,
    which is taken to mean it carries its own scheme (http://, file://,
    ...).  Otherwise `url` is parsed with windows_file_url_parse and its
    path is resolved against `base`:

      ./x  or .\\x     -> directory of base
      ../x or ..\\x    -> one directory above that
      .../x or ...\\x  -> two directories above that
      /x   or \\x      -> root of the drive taken from base
      anything else    -> directory of base

    An empty path (fragment-only reference) yields base + '#' + fragment.
    The fragment of `url`, if any, is re-attached to the result.

    Paths that contain '..' anywhere but at the very start (for example
    .\\test\\..\\images\\) are not normalised.
    """
    def add_fragment(path, frag):
        # Re-attach the fragment only when there is one.
        if frag != '':
            return path + '#' + frag
        return path

    i = url.find (':')
    # A new http:// or file:// URL that is not relative to the source
    # document is returned as is.
    if 3 <= i <= 10:
        return url

    (prot, host, path, params, query, fragment) = windows_file_url_parse (url)
    if path == '':
        return base + '#' + fragment

    base_dir = os.path.dirname (str (base))
    # .\file.ext == file.ext
    if path[0:2] in ('.\\', './'):
        return os.path.join (base_dir, add_fragment (path[2:], fragment))
    # one dir up
    if path[0:3] in ('..\\', '../'):
        return os.path.join (os.path.dirname (base_dir),
                             add_fragment (path[3:], fragment))
    # two dir up
    if path[0:4] in ('...\\', '.../'):
        return os.path.join (os.path.dirname (os.path.dirname (base_dir)),
                             add_fragment (path[4:], fragment))
    # Root dir
    if path[0:1] in ('\\', '/'):
        str_base = str (base)
        # NOTE(review): character 5 of base is used as the drive letter,
        # which presumably expects base to look like 'file:C:...' --
        # confirm against callers (a 'file://C:...' base would yield '/').
        return os.path.join ('file:' + str_base[5] + ':',
                             add_fragment (path[1:], fragment))
    # normal case
    return os.path.join (base_dir, add_fragment (path, fragment))
######################################################################
# Replacement for the urlparse lib, because this is buggy on Windows #
# And its behavior changed in Python 2.2.2 CRH
######################################################################
def plucker_file_urlunparse(protocol, host, path, params, query, fragment):
    """Rebuild a ``plucker:``/``file:`` URL string from its components.

    The result is '<protocol>:' (only when protocol is non-empty),
    followed by the path, followed by '#<fragment>' (only when fragment
    is non-empty).  `host`, `params` and `query` are accepted so the
    signature mirrors the urlparse 6-tuple, but they are not used.
    """
    text = ''
    if protocol != '':
        text = protocol + ':'
    # The path is emitted even without a protocol, so that a call with an
    # empty protocol yields the bare path instead of an empty string.
    text = text + path
    if fragment != '':
        text = text + '#' + fragment
    return text
class URL:
    """Encapsulate some useful things from urllib and urlparse.

    An instance stores the six urlparse components (protocol, host, path,
    params, query, fragment) of one URL; the host is kept in lower case.
    """

    def __init__ (self, url, base = None):
        """Build a URL from `url`, optionally resolved against `base`.

        `url` may be another URL instance or anything str() can convert.
        When `base` is given, `url` is first joined onto it.  On win32,
        ``file:`` URLs are joined and parsed with the windows_file_*
        helpers instead of urlparse.
        """
        if isinstance (url, URL) and base is None:
            # Simple copy constructor: make it more efficient
            self._protocol = url._protocol
            self._host = url._host
            self._path = url._path
            self._params = url._params
            self._query = url._query
            self._fragment = url._fragment
        else:
            url = str (url)
            if base is not None:
                base = str (base)
                if sys.platform == 'win32' and base[0:5].lower () == 'file:':
                    url = windows_file_urljoin (base, url)
                else:
                    url = urlparse.urljoin (base, url)
            # The joined URL is deliberately NOT passed through
            # urllib.unquote: according to RFC 2396 that would be
            # inappropriate, and according to the HTML 4.01 spec it is
            # unnecessary.
            if sys.platform == 'win32' and url[0:5].lower () == 'file:':
                (prot, host, path, params, query, fragment) = \
                    windows_file_url_parse (url)
            else:
                (prot, host, path, params, query, fragment) = \
                    urlparse.urlparse (url)
            host = host.lower ()
            self._protocol = prot
            self._host = host
            self._path = path
            self._params = params
            self._query = query
            self._fragment = fragment

    def as_string (self, with_fragment):
        """Return the URL as a string; drop the fragment if
        `with_fragment` is false."""
        if with_fragment:
            fragment = self._fragment
        else:
            fragment = ""
        if self._protocol == 'plucker' or self._protocol == 'file':
            text = plucker_file_urlunparse (self._protocol,
                                            self._host,
                                            self._path,
                                            self._params,
                                            self._query,
                                            fragment)
        else:
            text = urlparse.urlunparse ((self._protocol,
                                         self._host,
                                         self._path,
                                         self._params,
                                         self._query,
                                         fragment))
        return text

    def __str__ (self):
        return self.as_string (with_fragment=1)

    def __repr__ (self):
        return "URL (%s)" % repr (self.as_string (with_fragment=1))

    def get_protocol (self):
        """Return the protocol (scheme) component."""
        return self._protocol

    def get_host (self):
        """Return the (lower-cased) host component."""
        return self._host

    def get_path (self):
        """Return the path component."""
        return self._path

    def get_fragment (self):
        """Return the fragment component (without the '#')."""
        return self._fragment

    def get_full_path (self, with_fragment):
        """Return the URL without protocol and host; drop the fragment if
        `with_fragment` is false."""
        if with_fragment:
            fragment = self._fragment
        else:
            fragment = ""
        if sys.platform == 'win32' and self._protocol == 'file':
            # NOTE(review): this relies on plucker_file_urlunparse
            # emitting the path even when the protocol is empty -- verify.
            text = plucker_file_urlunparse ("",
                                            "",
                                            self._path,
                                            self._params,
                                            self._query,
                                            fragment)
        else:
            text = urlparse.urlunparse (("",
                                         "",
                                         self._path,
                                         self._params,
                                         self._query,
                                         fragment))
        return text

    def remove_fragment (self):
        """Discard the fragment component in place."""
        self._fragment = ""
def CleanURL (url, base=None):
    """Remove leading and trailing white space and generally clean up
    this URL.

    `url` is either a URL instance or something str() can convert; `base`
    is an optional base the URL is resolved against.  For non-URL input
    the text is stripped and every remaining space is replaced by '%20'
    before parsing.  Returns the cleaned URL as a string, including its
    fragment.
    """
    if isinstance (url, URL):
        # This branch is currently never taken, we always get called
        # with a string as 'url'
        if base is not None:
            # FIXME!! Does this make sense at all?  URLs should always be
            # absolute, so giving a base is moot...
            result = URL (url, base).as_string (with_fragment=1)
        else:
            result = url.as_string (with_fragment=1)
    else:
        url = str (url).strip ()
        # If we don't want an invalid request error then
        # replace space chars " " with "%20" (url encoded space)
        url = url.replace (" ", "%20")
        result = URL (url, base).as_string (with_fragment=1)
    return result
