Update of /cvsroot/freevo/freevo/lib/pywebinfo/src
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv8997
Added Files:
__init__.py config.py grabber.py grabberitem.py httpreader.py
Log Message:
New module for grabbing/scraping information from the web. Contains its own
httplib implementation for keeping the notifier alive.
--- NEW FILE: config.py ---
# -*- coding: iso-8859-1 -*-
# -----------------------------------------------------------------------------
# config.py - Basic configuration for some utils used in Freevo
# -----------------------------------------------------------------------------
# $Id: config.py,v 1.1 2005/04/15 20:56:07 vifredr Exp $
#
# Most modules in the util directory don't need Freevo, so it is possible
# to use them in a different project (licence is GPL). But the modules need
# a very simple configuration, like where to store cache files, and some need
# the virtual filesystem (util.vfs) to work, which has its own data
# directory. A different problem is the python encoding handling. It is
# fixed to 'ascii' (you can change it in site.py, but that doesn't always
# work and is a bad solution).
#
# This module provides some basic settings to solve those problems. It reads
# a config file and stores everything in a struct. It will create necessary
# directories for vfs and caching and provides the helper functions Unicode
# and String to solve the encoding problem.
#
# If you want to use an util module in a different project, you may also
# need this file. The name freevo in the config file can be changed at the
# beginning of this file.
#
# -----------------------------------------------------------------------------
# Freevo - A Home Theater PC framework
# Copyright (C) 2002-2004 Krister Lagerstrom, Dirk Meyer, et al.
#
# First Edition: Dirk Meyer <[EMAIL PROTECTED]>
# Maintainer: Dirk Meyer <[EMAIL PROTECTED]>
#
# Please see the file freevo/Docs/CREDITS for a complete list of authors.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MER-
# CHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# -----------------------------------------------------------------------------
# Application name. This name is used to locate the config file.
# Possible locations are ., ~/application, /etc/application and
# /usr/local/etc/application. The name of the config file is
# application.conf. If not defined, the variables vfs, encoding and
# cachedir are added to the resulting struct to have all needed information.
application = 'pywebinfo'
# That's it, you shouldn't need to make changes after this point
# Names exported by "from config import *".
__all__ = [ 'CONF', 'Unicode', 'String' ]
# Python imports
import os
import sys
import locale
import __builtin__
# Dummy class for the CONF
class struct:
    """
    Empty attribute container. The settings are stored as plain
    attributes on an instance of this class.
    """
    pass


CONF = struct()

# find the correct encoding
try:
    CONF.default_encoding = locale.getdefaultlocale()[1]
    # Probe the codec: this raises TypeError when the locale reports no
    # encoding (None) and LookupError when python does not know the codec.
    ''.encode(CONF.default_encoding)
except (TypeError, ValueError, LookupError):
    # ValueError covers an unparsable locale setting in getdefaultlocale()
    CONF.default_encoding = 'latin-1'

CONF.encoding = CONF.default_encoding

# add everything in CONF to the module variable list (but in upper
# case, so CONF.vfs_dir is VFS_DIR, too)
for key in CONF.__dict__:
    # direct assignment instead of exec() on a formatted string
    globals()[key.upper()] = getattr(CONF, key)
# encoding helper functions
def Unicode(string, encoding=None):
    """
    Convert an object to unicode using the sysconfig encoding as
    fallback instead of ascii.

    @param string  : object to convert. Unicode objects are returned
                     unchanged, byte strings are decoded, everything
                     else is converted to a string first.
    @param encoding: encoding to try first, defaults to CONF.encoding
    @returns       : unicode object
    """
    if not encoding:
        encoding = CONF.encoding

    # isinstance instead of comparing __class__, so that subclasses of
    # unicode and str are handled like their base types.
    if isinstance(string, unicode):
        return string

    if isinstance(string, str):
        # try the requested encoding, then the locale default, then UTF-8
        for candidate in (encoding, CONF.default_encoding, 'UTF-8'):
            try:
                return unicode(string, candidate)
            except UnicodeDecodeError:
                pass
        # nothing decoded cleanly, replace the undecodable bytes
        return unicode(string, encoding, 'replace')

    # any other object: go through its string representation
    try:
        text = str(string)
    except UnicodeError:
        # str() failed on non-ascii text, same fallback as String() uses
        return unicode(string)
    return Unicode(text, encoding)
def String(string, encoding=None):
    """
    Convert an object to string using the sysconfig encoding as
    fallback instead of ascii.

    @param string  : object to convert. Unicode objects are encoded,
                     byte strings are decoded with Unicode() and encoded
                     again, everything else is passed to str().
    @param encoding: target encoding, defaults to CONF.encoding
    @returns       : byte string; characters that cannot be encoded
                     are replaced.
    """
    if not encoding:
        encoding = CONF.encoding

    # isinstance instead of comparing __class__, so that subclasses of
    # unicode and str are handled like their base types.
    if isinstance(string, unicode):
        return string.encode(encoding, 'replace')

    if isinstance(string, str):
        return Unicode(string, encoding).encode(encoding, 'replace')

    try:
        return str(string)
    except UnicodeError:
        # str() cannot represent the object in ascii, encode it ourselves
        return unicode(string).encode(encoding, 'replace')
# Make the two helpers available as builtins, so every module can call
# Unicode() and String() without importing them.
__builtin__.Unicode = Unicode
__builtin__.String = String
--- NEW FILE: httpreader.py ---
# -*- coding: iso-8859-1 -*-
# -----------------------------------------------------------------------------
# httpreader.py - http implementation keeping the notifier alive.
# -----------------------------------------------------------------------------
# $Id: httpreader.py,v 1.1 2005/04/15 20:56:08 vifredr Exp $
#
# Todo: Add support for SSL, IPv6, Auth?
# -----------------------------------------------------------------------------
# Freevo - A Home Theater PC framework
# Copyright (C) 2002-2004 Krister Lagerstrom, Dirk Meyer, et al.
#
# First Edition: Viggo Fredriksen <[EMAIL PROTECTED]>
# Maintainer: Viggo Fredriksen <[EMAIL PROTECTED]>
#
# Please see the file freevo/Docs/CREDITS for a complete list of authors.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MER-
# CHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# -----------------------------------------------------------------------------
# python modules
import re
import os
import socket
from errno import EALREADY, EINPROGRESS, EWOULDBLOCK, EISCONN
from cStringIO import StringIO
import logging
log = logging.getLogger('pywebinfo')

# notifier to keep main loop alive
import notifier
if notifier.loop is None:
    # no main loop has been set up yet, initialize the notifier
    notifier.init()

# socket timeouts: use the timeoutsocket module when it is installed,
# else the default timeout of the socket module when it has one
try:
    import timeoutsocket
    timeoutsocket.setDefaultSocketTimeout(1)
except ImportError:
    if hasattr(socket, 'setdefaulttimeout'):
        socket.setdefaulttimeout(1)
class HTTPReader:
"""
This class fetches documents with HTTP. This is done by
using callbacks to the registered handler. The handler must
support the following callbacks:
- handle_progress(url, bytes_fetched_since_last, bytes_total_length):
Makes it possible to do progress calculations.
- handle_header(url, header):
Makes it possible to do stuff with the header if nec.
- handle_line(url, line):
One line of data returned from socket of the body.
- handle_finished(url):
A request was completed. If an error occurs during the
transfer, this will also be called.
- handle_error(url, reason):
A request failed to url with error reason.
Creation of the class was motivated by the fact that I could
not find a proper way of using sockets for the notifier with
the existing python libs. Both urllib, urllib2 and httplib
seems to do their own internal caching, and can not guarantee
proper behaviour of their file-objects. This could lead to
some nasty blocking issues for Freevo.
@param url : url to fetch
@param handler : handler for callbacks
@param language: agent language
"""
# regular expressions
m_chunk = re.compile('^([\dA-Fa-f]+).*$').match
m_url = re.compile('http:/+([^/:]*):*([^/]*)(.*)').match
# connection info
length = None
connected = False
def __init__(self, url, handler, language='en-US'):
self.url = url
# extract url info
match_url = self.m_url(url)
if not match_url:
# invalid url
self.__fail('URL not supported or invalid')
else:
host = match_url.group(1)
port = match_url.group(2)
uri = match_url.group(3)
if port == '':
# default to port 80
port = 80
else:
# port defined
port = int(port)
# chunk information
self.__chunked = False
self.__chunked_left = 0
self.__chunked_over = None
# header information
self.__header = {}
self.__header_complete = False
# handler for this connection
self.__handler = handler
# connection information
self.__address = (host, port)
self.__in = StringIO()
self.__out = ''
self.__out_pointer = 0
self.__out_length = 0
self.connected = False
self.length = None
# create the socket
self.__socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
self.__socket.setblocking(0)
# create a GET header
self.add_header('GET %s HTTP/1.1\r\n' % uri)
self.add_header('Host: %s:%i\r\n' % (host, port))
self.add_header('User-Agent: Freevo (pywebinfo)\r\n')
self.add_header('Accept-Language: %s\r\n' % language)
self.add_header('Connection: close\r\n')
self.header_finalize()
# add the socket to the notifier
notifier.addSocket(self.__socket, self.__write, notifier.IO_WRITE)
def add_header(self, header):
"""
Add to header
"""
self.__out += header
def header_finalize(self):
"""
Complete the header
"""
# add terminator
self.add_header('\r\n')
# set initial values
self.__out_pointer = 0
self.__out_length = len(self.__out)
def __fail(self, reason):
"""
Handle failures
"""
self.__handler.handle_progress(self.url, 1, 1)
self.__handler.handle_error(self.url, reason)
self.__cleanup()
def __cleanup(self):
"""
Clean up
"""
try:
# ensure the socket was closed.
self.__socket.close()
except:
pass
try:
# Free the string buffer
self.__in.close()
except:
pass
self.connected = False
self.__header = None
self.__in = None
self.__out = None
self.__socket = None
self.__handler = None
def __connect(self, fd):
"""
Connect to host.
"""
try:
err = fd.connect_ex(self.__address)
except socket.gaierror, e:
# address error
self.__fail('Connection failure: %s' % e[1])
return False
if err in (EINPROGRESS, EALREADY, EWOULDBLOCK):
# not yet connected
return True
elif err in (0, EISCONN):
# connected to host
self.connected = True
return True
# something wrong has happend
self.__fail('Connection failure: %s' % os.strerror(err))
return False
def __write(self, fd):
"""
Write output-buffer.
"""
if not self.connected:
# need to connect first
return self.__connect(fd)
try:
# send as much data as possible
self.__out_pointer += fd.send( self.__out[self.__out_pointer:] )
except socket.error, e:
self.__fail('Error sending to socket: %s' % e)
return False
except socket.timeout:
self.__fail('Error sending to socket: socket timeout')
return False
if self.__out_pointer == self.__out_length:
# done sending, start reading the response
notifier.addSocket(self.__socket, self.__read, notifier.IO_READ)
return False
# continue sending
return True
def __read(self, fd):
"""
read callback from notifier
"""
try:
# read data from the socket
# FIXME: This value needs tuning.
chunk = fd.recv(100)
except socket.error, e:
self.__fail('Error reading from socket: %s' % e)
return False
except socket.timeout:
self.__fail('Error reading from socket: socket timeout')
return False
# write the data to the input-buffer
self.__in.write(chunk)
self.__in.seek(0)
while 1:
# FIXME: while 1: is dangerous if something
# unexpected happens.
# read one line of data
tell = self.__in.tell()
line = self.__in.readline()
if not line.endswith('\n'):
# Incomplete read, seek back
# before readline so we don't
# miss any data.
self.__in.seek(tell)
break
if not self.__header_complete:
# Header parsing. Send header to
# handler when header is complete.
if not line.strip():
# reached end of header,
# no further reading nec.
self.__handler.handle_header(self.url, self.__header)
self.__header_complete = True
break
if ':' in line:
# header value, split line at the colon
key, value = line.split(':', 1)
self.__server_header(key, value)
elif line.startswith('HTTP'):
# http response code.
# FIXME: Handle 100 continue here!
self.__server_header('httpcode', line.split(' ')[1])
else:
# body handling, send lines to handler.
# handle chunked data properly.
if self.__chunked:
# handle chunked data
if self.__chunked_left <= 0:
# data is chunked, find the offset
m_chunk = self.m_chunk(line)
if m_chunk:
self.__chunked_left = int(m_chunk.group(1), 16)
continue
self.__chunked_left -= len(line)
if self.__chunked_over:
# add the leftover from last chunk
# to remove CRLF from previous line
line = ''.join([self.__chunked_over, line])
self.__chunked_over = None
if self.__chunked_left == 0:
self.__chunked_over = line[:-1]
continue
elif self.__chunked_left < 0:
self.__chunked_over = line[:-2]
continue
# return the line to the parser
self.__handler.handle_line(self.url, line)
# remove the extracted info
data = self.__in.read()
self.__in.seek(0)
self.__in.write(data)
self.__in.truncate()
if not chunk:
# finished reading, return remaining lines
# FIXME: verify that this is the correct sequence.
if self.__chunked_over:
# something has been left over
self.__handler.handle_line(self.url, self.__chunked_over)
self.__chunked_over = None
line = self.__in.getvalue()
if line:
# something is left in the input buffer
self.__handler.handle_line(self.url, line)
self.__socket.close()
if not self.length:
# make sure we are 100% finished
self.__handler.handle_progress(self.url, 1, 1)
self.__handler.handle_finished(self.url)
self.__cleanup()
# remove socket
return False
# callback for progress indication
self.__handler.handle_progress(self.url, len(chunk), self.length)
# more information to be read.
return True
def __server_header(self, key, value):
"""
Add a header to the request.
"""
key = key.lower()
value = value.strip()
self.__header[key] = value
if key == 'content-length':
# set the content length
self.length = int(value)
elif key == 'transfer-encoding' and value == 'chunked':
# the content is chunked
self.__chunked = True
--- NEW FILE: grabber.py ---
# -*- coding: iso-8859-1 -*-
# -----------------------------------------------------------------------------
# grabber.py - Template for grabbers
# -----------------------------------------------------------------------------
# $Id: grabber.py,v 1.1 2005/04/15 20:56:08 vifredr Exp $
#
# -----------------------------------------------------------------------------
# Freevo - A Home Theater PC framework
# Copyright (C) 2002-2004 Krister Lagerstrom, Dirk Meyer, et al.
#
# First Edition: Viggo Fredriksen <[EMAIL PROTECTED]>
# Maintainer: Viggo Fredriksen <[EMAIL PROTECTED]>
#
# Please see the file freevo/Docs/CREDITS for a complete list of authors.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MER-
# CHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# -----------------------------------------------------------------------------
# python modules
import logging
from cStringIO import StringIO
from types import StringTypes
# pywebinfo modules
from pywebinfo.httpreader import HTTPReader
log = logging.getLogger('pywebinfo')

# notifier to keep main loop alive
try:
    import notifier
    if notifier.loop is None:
        # no main loop has been set up yet, initialize the notifier
        notifier.init()
except ImportError:
    notifier = None
class Grabber:
    """
    Basic grabber template. It implements the handler callbacks used
    by the HTTPReader, calculates the overall progress and delivers
    the result either to a callback or as return value.
    """
    def __init__(self, cb_progress=None, cb_error=None, cb_result=None,
                 language='en-US'):
        """
        @param cb_progress: A callback for reporting approximate
                            progress status (0-100). Should be
                            used for keeping the main loop alive
                            if using notifier.
        @param cb_result:   A callback for receiving the result
                            from the query.
        @param cb_error:    A callback for receiving errors in the query.
        @param language:    Accept-Language.
        """
        self.cb_result   = cb_result
        self.cb_progress = cb_progress
        self.cb_error    = cb_error
        self.language    = language

        self.delivered = False

        # url -> content length, to count every url only once
        self.d_progress     = {}
        self.bytes_total    = 0
        self.bytes_fetched  = 0
        self.bytes_progress = 0

        self.__result = None


    def handle_header(self, url, header):
        """
        Handle the initial header
        """
        pass


    def handle_line(self, url, line):
        """
        Handle one line of data
        """
        pass


    def handle_finished(self, url):
        """
        Marker sent when an url has finished
        processing by the httpreader
        """
        pass


    def handle_error(self, url, reason):
        """
        Callback for error handling. Classes overriding this should
        call cb_error if it exists. By default this sets results
        to None and delivers this to the waiting caller.
        """
        log.warning('Failed fetching %s:%s', url, reason)

        if self.cb_error:
            # deliver error to receiver
            self.cb_error(url, reason)

        # deliver an empty result
        self.deliver_result(None)


    def handle_progress(self, url, fetched=None, length=None):
        """
        Calculate overall progress status and report it to
        cb_progress as percentage (0-100).
        TODO: The calculations could probably be done better.

        @param url:     url the progress belongs to
        @param fetched: bytes fetched since the last call
        @param length:  total length of the document, if known
        """
        if not self.cb_progress or self.delivered:
            # not interesting
            return

        if fetched:
            self.bytes_fetched += fetched

        if length and url not in self.d_progress:
            # first time the length of this url is known
            self.d_progress[url] = length
            self.bytes_total += length

        if self.bytes_total == 0:
            perc = 0
        else:
            perc = float(self.bytes_fetched*100) / float(self.bytes_total)

        perc = min( int(perc), 100 )
        self.bytes_progress = perc

        # callback the percentage done.
        self.cb_progress(perc)


    def deliver_result(self, result):
        """
        Deliver result to the registered callback. Without a callback
        the result is stored for return_result().
        """
        self.delivered = True
        if self.cb_result:
            # deliver via callback
            self.cb_result(result)
        else:
            # deliver via loop
            self.__result = result


    def return_result(self):
        """
        Helper for methods which can return a result
        when the grabber is configured to not use callback
        for the result. Returns None when a result callback is used.
        """
        if self.cb_result:
            # This grabber uses callbacks to deliver
            # results.
            return

        while not self.delivered:
            # step until finished
            notifier.step(False, False)

        # reset, the grabber can be used for the next query
        self.delivered = False

        # return the result
        return self.__result


    def get_image(self, url_or_urls):
        """
        Special function allowing fetching of images
        from the web and printing copyright info to
        the logger.

        @param url_or_urls: one url or a list of urls
        @returns: the value of return_result()
        """
        if isinstance(url_or_urls, StringTypes):
            url_or_urls = [url_or_urls]

        _ImageGrabber(self, url_or_urls, self.language)

        log.info('Downloading images from:')
        log.info('\n'.join(url_or_urls))
        log.info('Freevo knows nothing of the copyright')
        log.info('status of this/these image(s). Refer to')
        log.info('the above source(s) for more information')
        log.info('about private use.')

        return self.return_result()


    def get_url(self, url_or_urls):
        """
        This makes it possible to retrieve several
        urls at once.

        @param url_or_urls: one url or a list of urls
        """
        if isinstance(url_or_urls, StringTypes):
            url_or_urls = [url_or_urls]

        for url in url_or_urls:
            # create the readers for the urls
            HTTPReader(url, self, self.language)
class _ImageGrabber:
    """
    Generic image grabber, do not use directly. This is used
    by the Grabber.get_image() method. The result delivered to
    the parent is a dict with one buffer of image data for each url.
    """
    def __init__(self, parent, urls, language):
        """
        @param parent:   object receiving progress, errors and result
        @param urls:     list of image urls
        @param language: Accept-Language for the requests
        """
        self.image_data = {}
        for url in urls:
            self.image_data[url] = StringIO()

        self.num_images   = len(urls)
        self.num_finished = 0
        self.parent       = parent

        for url in urls:
            # create the readers for the urls
            HTTPReader(url, self, language)


    def handle_header(self, url, header):
        """
        Handle header
        """
        pass


    def handle_line(self, url, line):
        """
        Handle one line of data
        """
        self.image_data[url].write(line)


    def handle_finished(self, url):
        """
        Count finished items, deliver the images after the last one
        """
        self.num_finished += 1
        # rewind, the receiver reads the data from the start
        self.image_data[url].seek(0)
        if self.num_images == self.num_finished:
            self.parent.deliver_result(self.image_data)


    def handle_error(self, url, reason):
        """
        Handle a failed download. The HTTPReader calls this for every
        request that fails, the error is passed on to the parent.
        """
        self.parent.handle_error(url, reason)


    def handle_progress(self, url, fetched=None, length=None):
        """
        Handle progress
        """
        self.parent.handle_progress(url, fetched, length)
--- NEW FILE: __init__.py ---
--- NEW FILE: grabberitem.py ---
# -*- coding: iso-8859-1 -*-
# -----------------------------------------------------------------------------
# grabberitem.py - Basic item from grabbers
# -----------------------------------------------------------------------------
# $Id: grabberitem.py,v 1.1 2005/04/15 20:56:08 vifredr Exp $
#
# Notes: The unicode stuff is _not_ verified!
#
# -----------------------------------------------------------------------------
# Freevo - A Home Theater PC framework
# Copyright (C) 2002-2004 Krister Lagerstrom, Dirk Meyer, et al.
#
# First Edition: Viggo Fredriksen <[EMAIL PROTECTED]>
# Maintainer: Viggo Fredriksen <[EMAIL PROTECTED]>
#
# Please see the file freevo/Docs/CREDITS for a complete list of authors.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MER-
# CHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# -----------------------------------------------------------------------------
import config
import copy
import htmlentitydefs
from types import StringTypes
class GrabberItem:
    """
    Basic item returned by the grabbers. The information is stored
    in attributes of the item, item[name] returns the attribute or
    None if it is not set.
    """
    # named entities with a replacement different from the character
    # in the html entity table
    _NAMED_ENTITIES = {
        'ndash' : u'-',
        'bull'  : u'-',
        'rsquo' : u"'",
        'lsquo' : u'`',
        'hellip': u'...',
        'apos'  : u"'",
    }

    # numerical entities for the right single quotation mark (146 is
    # its windows-1252 position), resolved to a plain apostrophe
    _NUMERIC_ENTITIES = {
        146 : u"'",
        8217: u"'",
    }

    def __init__(self):
        pass


    def __getitem__(self, attr):
        """
        Return the attribute attr or None if it is not set
        """
        return self.__dict__.get(attr)


    def to_unicode(self):
        """
        Set all string attributes to unicode. The strings are
        stripped and the html entities resolved.
        """
        for name, value in self.__dict__.items():
            if isinstance(value, StringTypes):
                # it's a string, convert it to unicode
                self.__dict__[name] = self.htmlenties2txt(value.strip())


    def __str__(self):
        """
        Return all string attributes, one 'name: value' line each
        """
        val = []
        for name, value in self.__dict__.items():
            if isinstance(value, StringTypes):
                val.append(u'%s: %s' % (name, Unicode(value)))
        # __str__ has to return a byte string, non-ascii unicode
        # would raise an error in str(item)
        return String(u'\n'.join(val))


    def _resolve_entity(self, name):
        """
        Return the unicode text for the entity name (the text between
        '&' and ';') or None if it is not a known entity.
        """
        if name.startswith('#'):
            # numerical entity, decimal or hexadecimal
            digits = name[1:]
            try:
                if digits[:1] in ('x', 'X'):
                    code = int(digits[1:], 16)
                else:
                    code = int(digits)
                if code in self._NUMERIC_ENTITIES:
                    return self._NUMERIC_ENTITIES[code]
                return unichr(code)
            except (ValueError, OverflowError):
                # no number or no valid character
                return None

        if name in self._NAMED_ENTITIES:
            return self._NAMED_ENTITIES[name]

        # name2codepoint gives the unicode position. The entitydefs
        # table has latin-1 byte strings which can not be mixed with
        # unicode.
        code = htmlentitydefs.name2codepoint.get(name)
        if code is None:
            return None
        return unichr(code)


    def htmlenties2txt(self, string):
        """
        Converts a string to a string with all html entities resolved.
        Returns the result as Unicode object (that may contain chars
        outside 256). Text that looks like an entity but is unknown
        is not changed.

        NOTE(review): the two quote replacements of the first version
        are not readable in the archived copy of this file, the
        entities for the apostrophe and the right single quotation
        mark are resolved to "'" here -- confirm against the repository.
        """
        string = Unicode(string).replace(u'\u2019', u"'")

        result = []
        pos = 0
        while 1:
            amp = string.find('&', pos)     # find & as start of entity
            if amp == -1:                   # not found
                break
            semicolon = string.find(';', amp + 1) # find ; as end of entity
            if semicolon == -1:
                # no end of entity in the rest of the string
                break

            replacement = self._resolve_entity(string[amp + 1:semicolon])
            if replacement is None:
                # not an entity, keep the & and search behind it
                result.append(string[pos:amp + 1])
                pos = amp + 1
            else:
                result.append(string[pos:amp])
                result.append(replacement)
                pos = semicolon + 1

        result.append(string[pos:])
        return u''.join(result)
-------------------------------------------------------
SF email is sponsored by - The IT Product Guide
Read honest & candid reviews on hundreds of IT Products from real users.
Discover which products truly live up to the hype. Start reading now.
http://ads.osdn.com/?ad_id=6595&alloc_id=14396&op=click
_______________________________________________
Freevo-cvslog mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/freevo-cvslog