Update of /cvsroot/freevo/freevo/lib/pywebinfo/src
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv8997

Added Files:
        __init__.py config.py grabber.py grabberitem.py httpreader.py 
Log Message:
New module for grabbing/scraping information from the web. It contains its
own httplib-style implementation to keep the notifier alive.


--- NEW FILE: config.py ---
# -*- coding: iso-8859-1 -*-
# -----------------------------------------------------------------------------
# config.py - Basic configuration for some utils used in Freevo
# -----------------------------------------------------------------------------
# $Id: config.py,v 1.1 2005/04/15 20:56:07 vifredr Exp $
#
# Most modules in the util directory don't need Freevo. So it is possible
# to use them in a different project (licence is GPL). But the modules need
# a very simple configuration, like where to store cache files, and some need
# the virtual filesystem (util.vfs) to work, which has its own data
# directory. A different problem is the python encoding handling. It is
# fixed to 'ascii' (you can change it in site.py, but that doesn't always
# work and is a bad solution).
#
# This module provides some basic settings to solve those problems. It reads
# a config file and stores everything in a struct. It will create necessary
# directories for vfs and caching and provides the helper functions Unicode
# and String to solve the encoding problem.
#
# If you want to use a util module in a different project, you may also
# need this file. The name freevo in the config file can be changed at the
# beginning of this file.
#
# -----------------------------------------------------------------------------
# Freevo - A Home Theater PC framework
# Copyright (C) 2002-2004 Krister Lagerstrom, Dirk Meyer, et al.
#
# First Edition: Dirk Meyer <[EMAIL PROTECTED]>
# Maintainer:    Dirk Meyer <[EMAIL PROTECTED]>
#
# Please see the file freevo/Docs/CREDITS for a complete list of authors.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MER-
# CHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# -----------------------------------------------------------------------------

# Application name. This name is used to locate the config file.
# Possible locations are ., ~/application, /etc/application and
# /usr/local/etc/application. The name of the config file is
# application.conf. If not defined, the variables vfs, encoding and
# cachedir are added to the resulting struct to have all needed information.
application = 'pywebinfo'

# That's it, you shouldn't need to make changes after this point

# Names exported by "from config import *".
__all__ = [ 'CONF', 'Unicode', 'String' ]

# Python imports
import os
import sys
import locale
import __builtin__

# Dummy class for the CONF
class struct:
    """
    Empty container class; settings are attached as plain attributes.
    """


# the one settings object of this module
CONF = struct()

# Find the correct encoding. The value reported by the locale module is
# probed with a throw-away encode() call, so a missing (None) or unknown
# encoding name is detected here and replaced by latin-1.
try:
    CONF.default_encoding = locale.getdefaultlocale()[1]
    ''.encode(CONF.default_encoding)
except Exception:
    # Best effort: whatever goes wrong while probing selects the
    # fallback. Unlike a bare except this does not swallow
    # KeyboardInterrupt or SystemExit (python 2.5 and later).
    CONF.default_encoding = 'latin-1'

CONF.encoding = CONF.default_encoding

# add everything in CONF to the module variable list (but in upper
# case, so CONF.default_encoding is DEFAULT_ENCODING, too). The names
# are written to globals() directly instead of exec()-ing source text.
for key in CONF.__dict__:
    globals()[key.upper()] = getattr(CONF, key)

# encoding helper functions

def Unicode(string, encoding=None):
    """
    Return the given object as unicode object.

    Byte strings are decoded with the requested encoding (the one from
    CONF when none is given) instead of python's ascii default. When
    that fails, the default encoding and UTF-8 are tried; as last
    resort the requested encoding is used again with the undecodable
    bytes replaced. Unicode objects are returned as they are, every
    other object is converted with str() first.
    """
    encoding = encoding or CONF.encoding
    kind = string.__class__

    if kind == unicode:
        # nothing to do
        return string

    if kind != str:
        # some other object, use its str() representation
        return Unicode(str(string), encoding)

    for candidate in (encoding, CONF.default_encoding, 'UTF-8'):
        try:
            return unicode(string, candidate)
        except UnicodeDecodeError:
            continue

    # no encoding fits, substitute the bytes that can't be decoded
    return unicode(string, encoding, 'replace')


def String(string, encoding=None):
    """
    Convert an object to string using the sysconfig encoding as
    fallback instead of ascii.

    @param string  : object to convert (unicode, str or anything else)
    @param encoding: encoding to use, CONF.encoding when not given
    @returns       : the object as str; unicode characters the encoding
                     can't represent are replaced instead of raising
    """
    if not encoding:
        encoding = CONF.encoding
    if string.__class__ == unicode:
        return string.encode(encoding, 'replace')
    if string.__class__ == str:
        # round trip through Unicode() before encoding again
        return Unicode(string, encoding).encode(encoding, 'replace')
    try:
        return str(string)
    except Exception:
        # str() failed (e.g. __str__ produced non ascii unicode), go
        # through unicode() instead. Catching Exception instead of
        # everything lets KeyboardInterrupt and SystemExit pass.
        return unicode(string).encode(encoding, 'replace')


# Publish Unicode and String as builtins, so other modules can use both
# names without importing them.
__builtin__.Unicode = Unicode
__builtin__.String  = String

--- NEW FILE: httpreader.py ---
# -*- coding: iso-8859-1 -*-
# -----------------------------------------------------------------------------
# httpreader.py - http implementation keeping the notifier alive.
# -----------------------------------------------------------------------------
# $Id: httpreader.py,v 1.1 2005/04/15 20:56:08 vifredr Exp $
#
# Todo: Add support for SSL, IPv6, Auth?
# -----------------------------------------------------------------------------
# Freevo - A Home Theater PC framework
# Copyright (C) 2002-2004 Krister Lagerstrom, Dirk Meyer, et al.
#
# First Edition: Viggo Fredriksen <[EMAIL PROTECTED]>
# Maintainer:    Viggo Fredriksen <[EMAIL PROTECTED]>
#
# Please see the file freevo/Docs/CREDITS for a complete list of authors.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MER-
# CHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# -----------------------------------------------------------------------------

# python modules
import re
import os
import socket
from errno import EALREADY, EINPROGRESS, EWOULDBLOCK, EISCONN
from cStringIO import StringIO

import logging
log = logging.getLogger('pywebinfo')

# notifier to keep main loop alive
import notifier
if notifier.loop is None:
    # nobody initialized the notifier so far
    notifier.init()

# socket timeouts: prefer the timeoutsocket module when it is
# installed, otherwise use the default timeout of the socket module
# (hasattr() guards against socket modules without that function)
try:
    import timeoutsocket
    timeoutsocket.setDefaultSocketTimeout(1)
except ImportError:
    if hasattr(socket, 'setdefaulttimeout'):
        socket.setdefaulttimeout(1)


class HTTPReader:
    """
    This class fetches documents with HTTP. This is done by
    using callbacks to the registered handler. The handler must
    support the following callbacks:

      - handle_progress(url, bytes_fetched_since_last, bytes_total_length):
          Makes it possible to do progress calculations.
      - handle_header(url, header):
          Makes it possible to do stuff with the header if necessary.
      - handle_line(url, line):
          One line of data returned from socket of the body.
      - handle_finished(url):
          A request was completed: the server closed the connection
          and all data was handed to the handler.
      - handle_error(url, reason):
          A request to url failed with error reason. It is preceded
          by handle_progress(url, 1, 1); handle_finished is not
          called for a failed request.

    An invalid url is reported through handle_error directly from the
    constructor, before any socket is created.

    Creation of the class was motivated by the fact that I could
    not find a proper way of using sockets for the notifier with
    the existing python libs. Both urllib, urllib2 and httplib
    seems to do their own internal caching, and can not guarantee
    proper behaviour of their file-objects. This could lead to
    some nasty blocking issues for Freevo.

    @param url     : url to fetch
    @param handler : handler for callbacks
    @param language: agent language
    """

    # regular expressions
    #   m_chunk: line starting with a hexadecimal number (chunk size)
    #   m_url  : host, optional port and request uri of a http url
    m_chunk = re.compile(r'^([\dA-Fa-f]+).*$').match
    m_url   = re.compile('http:/+([^/:]*):*([^/]*)(.*)').match

    # connection info
    length     = None
    connected  = False


    def __init__(self, url, handler, language='en-US'):
        self.url = url

        # Everything __fail() and __cleanup() touch has to exist before
        # the url is checked, otherwise reporting a bad url raises
        # AttributeError instead of reaching handle_error().
        self.__handler = handler
        self.__socket  = None
        self.__in      = None
        self.__out     = ''
        self.__header  = {}

        # extract url info
        match_url = self.m_url(url)

        if not match_url:
            # invalid url
            self.__fail('URL not supported or invalid')
            return

        host = match_url.group(1)
        port = match_url.group(2)
        # an url without path ("http://host") still needs a request
        # target, an empty one results in a malformed request line
        uri  = match_url.group(3) or '/'

        if port == '':
            # default to port 80
            port = 80
        else:
            # port defined
            try:
                port = int(port)
            except ValueError:
                # something like http://host:abc/
                self.__fail('URL not supported or invalid')
                return

        # chunk information
        self.__chunked      = False
        self.__chunked_left = 0
        self.__chunked_over = None

        # header information
        self.__header_complete = False

        # connection information
        self.__address     = (host, port)
        self.__in          = StringIO()
        self.__out_pointer = 0
        self.__out_length  = 0
        self.connected     = False
        self.length        = None

        # create the socket
        self.__socket  = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.__socket.setblocking(0)

        # create a GET header
        self.add_header('GET %s HTTP/1.1\r\n' % uri)
        self.add_header('Host: %s:%i\r\n' % (host, port))
        self.add_header('User-Agent: Freevo (pywebinfo)\r\n')
        self.add_header('Accept-Language: %s\r\n' % language)
        self.add_header('Connection: close\r\n')
        self.header_finalize()

        # add the socket to the notifier
        notifier.addSocket(self.__socket, self.__write, notifier.IO_WRITE)


    def add_header(self, header):
        """
        Append raw text to the request. The caller has to provide the
        trailing CRLF.
        """
        self.__out += header


    def header_finalize(self):
        """
        Complete the header and reset the send position.
        """
        # add terminator
        self.add_header('\r\n')

        # set initial values
        self.__out_pointer = 0
        self.__out_length  = len(self.__out)


    def __fail(self, reason):
        """
        Handle failures: report 100% progress and the error to the
        handler and release all resources.
        """
        self.__handler.handle_progress(self.url, 1, 1)
        self.__handler.handle_error(self.url, reason)
        self.__cleanup()


    def __cleanup(self):
        """
        Close socket and input buffer and drop all references. Safe to
        call when they were never created or are released already.
        """
        if self.__socket is not None:
            try:
                # ensure the socket was closed.
                self.__socket.close()
            except socket.error:
                # nothing left to release
                pass

        if self.__in is not None:
            # Free the string buffer
            self.__in.close()

        self.connected = False
        self.__header  = None
        self.__in      = None
        self.__out     = None
        self.__socket  = None
        self.__handler = None


    def __connect(self, fd):
        """
        Connect to host.

        @returns: True to stay registered at the notifier (connected or
                  connection still in progress), False after a failure
        """
        try:
            err = fd.connect_ex(self.__address)
        except socket.gaierror as e:
            # address error
            self.__fail('Connection failure: %s' % e.args[1])
            return False

        if err in (EINPROGRESS, EALREADY, EWOULDBLOCK):
            # not yet connected
            return True

        elif err in (0, EISCONN):
            # connected to host
            self.connected = True
            return True

        # something wrong has happened
        self.__fail('Connection failure: %s' % os.strerror(err))
        return False


    def __write(self, fd):
        """
        Write output-buffer (write callback from notifier).

        @returns: True while there is more to send, False when the
                  request was sent completely or sending failed
        """
        if not self.connected:
            # need to connect first
            return self.__connect(fd)

        # socket.timeout is a subclass of socket.error and has to be
        # caught first to get its own message
        try:
            # send as much data as possible
            self.__out_pointer += fd.send( self.__out[self.__out_pointer:] )

        except socket.timeout:
            self.__fail('Error sending to socket: socket timeout')
            return False

        except socket.error as e:
            self.__fail('Error sending to socket: %s' % e)
            return False

        if self.__out_pointer == self.__out_length:
            # done sending, start reading the response
            notifier.addSocket(self.__socket, self.__read, notifier.IO_READ)
            return False

        # continue sending
        return True


    def __read(self, fd):
        """
        read callback from notifier

        Appends the received data to the input buffer, parses all
        complete lines in it (header lines first, then body lines
        which are sent to the handler) and keeps an incomplete last
        line for the next call.

        @returns: True when more data is expected, False when the
                  connection was closed or reading failed
        """
        # socket.timeout is a subclass of socket.error and has to be
        # caught first to get its own message
        try:
            # read data from the socket
            # FIXME: This value needs tuning.
            chunk = fd.recv(100)
        except socket.timeout:
            self.__fail('Error reading from socket: socket timeout')
            return False
        except socket.error as e:
            self.__fail('Error reading from socket: %s' % e)
            return False

        # write the data to the input-buffer
        self.__in.write(chunk)
        self.__in.seek(0)

        while 1:
            # FIXME: while 1: is dangerous if something
            #        unexpected happens.
            # read one line of data
            tell = self.__in.tell()
            line = self.__in.readline()

            if not line.endswith('\n'):
                # Incomplete read, seek back
                # before readline so we don't
                # miss any data.
                self.__in.seek(tell)
                break

            if not self.__header_complete:
                # Header parsing. Send header to
                # handler when header is complete.
                if not line.strip():
                    # reached end of header,
                    # no further reading nec.
                    self.__handler.handle_header(self.url, self.__header)
                    self.__header_complete = True
                    break

                if ':' in line:
                    # header value, split line at the colon
                    key, value = line.split(':', 1)
                    self.__server_header(key, value)

                elif line.startswith('HTTP'):
                    # http response code.
                    # FIXME: Handle 100 continue here!
                    self.__server_header('httpcode', line.split(' ')[1])
            else:
                # body handling, send lines to handler.
                # handle chunked data properly.
                if self.__chunked:
                    # handle chunked data
                    if self.__chunked_left <= 0:
                        # data is chunked, find the offset
                        m_chunk = self.m_chunk(line)
                        if m_chunk:
                            self.__chunked_left = int(m_chunk.group(1), 16)
                            continue

                    self.__chunked_left -= len(line)

                    if self.__chunked_over:
                        # add the leftover from last chunk
                        # to remove CRLF from previous line
                        line = ''.join([self.__chunked_over, line])
                        self.__chunked_over = None

                    if self.__chunked_left == 0:
                        self.__chunked_over = line[:-1]
                        continue

                    elif self.__chunked_left < 0:
                        self.__chunked_over = line[:-2]
                        continue

                # return the line to the parser
                self.__handler.handle_line(self.url, line)

        # remove the extracted info: move the unparsed rest to the
        # start of the buffer and cut off everything behind it
        data = self.__in.read()
        self.__in.seek(0)
        self.__in.write(data)
        self.__in.truncate()

        if not chunk:
            # finished reading, return remaining lines
            # FIXME: verify that this is the correct sequence.
            if self.__chunked_over:
                # something has been left over
                self.__handler.handle_line(self.url, self.__chunked_over)
                self.__chunked_over = None

            line = self.__in.getvalue()
            if line:
                # something is left in the input buffer
                self.__handler.handle_line(self.url, line)

            self.__socket.close()

            if not self.length:
                # make sure we are 100% finished
                self.__handler.handle_progress(self.url, 1, 1)

            self.__handler.handle_finished(self.url)
            self.__cleanup()

            # remove socket
            return False

        # callback for progress indication
        self.__handler.handle_progress(self.url, len(chunk), self.length)

        # more information to be read.
        return True


    def __server_header(self, key, value):
        """
        Remember a header line of the response and pick up the
        information needed for reading the body.

        @param key  : header name, stored in lower case
        @param value: header value, stored without surrounding space
        """
        key   = key.lower()
        value = value.strip()

        self.__header[key] = value

        if key == 'content-length':
            # set the content length
            self.length = int(value)
        elif key == 'transfer-encoding' and value.lower() == 'chunked':
            # the content is chunked (the coding name is not case
            # sensitive)
            self.__chunked = True

--- NEW FILE: grabber.py ---
# -*- coding: iso-8859-1 -*-
# -----------------------------------------------------------------------------
# grabber.py - Template for grabbers
# -----------------------------------------------------------------------------
# $Id: grabber.py,v 1.1 2005/04/15 20:56:08 vifredr Exp $
#
# -----------------------------------------------------------------------------
# Freevo - A Home Theater PC framework
# Copyright (C) 2002-2004 Krister Lagerstrom, Dirk Meyer, et al.
#
# First Edition: Viggo Fredriksen <[EMAIL PROTECTED]>
# Maintainer:    Viggo Fredriksen <[EMAIL PROTECTED]>
#
# Please see the file freevo/Docs/CREDITS for a complete list of authors.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MER-
# CHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# -----------------------------------------------------------------------------

# python modules
import logging
from cStringIO import StringIO
from types import StringTypes

# pywebinfo modules
from pywebinfo.httpreader import HTTPReader

log = logging.getLogger('pywebinfo')

# notifier to keep main loop alive
try:
    import notifier
    if notifier.loop is None:
        # nobody initialized the notifier so far
        notifier.init()
except ImportError:
    # no notifier available; code stepping the main loop needs it
    notifier = None



class Grabber:
    """
    Basic grabber template.

    A grabber is the handler of the HTTPReader objects it creates. The
    handle_* methods are the callbacks of the readers; subclasses
    override them to parse the data and call deliver_result() when the
    result is complete.
    """
    def __init__(self, cb_progress=None, cb_error=None, cb_result=None,
                 language='en-US'):
        """
        @param cb_progress: A callback for reporting approximate
                            progress status (0-100). Should be
                            used for keeping the main loop alive
                            if using notifier.
        @param cb_result:   A callback for receiving the result
                            from the query.
        @param cb_error:    A callback for receiving errors in the query.

        @param language:    Accept-Language.
        """
        self.cb_result      = cb_result
        self.cb_progress    = cb_progress
        self.cb_error       = cb_error
        self.language       = language
        self.delivered      = False
        # url -> announced length, every url is counted only once
        self.d_progress     = {}
        self.bytes_total    = 0
        self.bytes_fetched  = 0
        self.bytes_progress = 0

        # result storage when no result callback is used
        self.__result       = None


    def handle_header(self, url, header):
        """
        Handle the initial header
        """
        pass


    def handle_line(self, url, line):
        """
        Handle one line of data
        """
        pass


    def handle_finished(self, url):
        """
        Marker sent when an url has finished
        processing by the httpreader
        """
        pass


    def handle_error(self, url, reason):
        """
        Callback for error handling. Classes overriding this should
        call cb_error if it exists. By default this sets results
        to None and delivers this to the waiting caller.
        """
        # let the logging module do the formatting
        log.warning('Failed fetching %s:%s', url, reason)

        if self.cb_error:
            # deliver error to receiver
            self.cb_error(url, reason)

        # deliver an empty result
        self.deliver_result(None)

    def handle_progress(self, url, fetched=None, length=None):
        """
        Calculate overall progress status.
        TODO: The calculations could probably be done better.

        @param url    : url the progress belongs to
        @param fetched: bytes fetched since the last call
        @param length : total length of the url if known
        """
        if not self.cb_progress or self.delivered:
            # not interesting
            return

        if fetched:
            self.bytes_fetched += fetched

        if length and url not in self.d_progress:
            # first time a length is known for this url
            self.d_progress[url] = length
            self.bytes_total    += length

        if self.bytes_total == 0:
            perc = 0
        else:
            perc = float(self.bytes_fetched*100) / float(self.bytes_total)
            perc = min( int(perc), 100 )

        self.bytes_progress = perc

        # callback the percentage done.
        self.cb_progress(perc)


    def deliver_result(self, result):
        """
        Deliver result to the registered callback, or store it for
        return_result() when there is no callback.
        """
        self.delivered = True

        if self.cb_result:
            # deliver via callback
            self.cb_result(result)
        else:
            # deliver via loop
            self.__result = result


    def return_result(self):
        """
        Helper for methods which can return a result
        when the grabber is configured to not use callback
        for the result.

        @returns: None when a result callback is used, otherwise the
                  result after stepping the notifier until it arrived
        """
        if self.cb_result:
            # This grabber uses callbacks to deliver
            # results.
            return

        while not self.delivered:
            # step until finished
            notifier.step(False, False)

        # ready for the next query
        self.delivered = False

        # return the result
        return self.__result


    def get_image(self, url_or_urls):
        """
        Special function allowing fetching of images
        from the web and printing copyright info to
        the logger.

        @param url_or_urls: one url or a list of urls
        @returns          : see return_result()
        """
        if isinstance(url_or_urls, StringTypes):
            url_or_urls = [url_or_urls]

        # the image grabber delivers its result through this object
        _ImageGrabber(self, url_or_urls, self.language)

        log.info('Downloading images from:')
        log.info('\n'.join(url_or_urls))
        log.info('Freevo knows nothing of the copyright')
        log.info('status of this/these image(s). Refer to')
        log.info('the above source(s) for more information')
        log.info('about private use.')

        return self.return_result()



    def get_url(self, url_or_urls):
        """
        This makes it possible to retrieve several
        urls at once.

        @param url_or_urls: one url or a list of urls
        """
        if isinstance(url_or_urls, StringTypes):
            url_or_urls = [url_or_urls]

        for url in url_or_urls:
            # create the readers for the urls
            HTTPReader(url, self, self.language)


class _ImageGrabber:
    """
    Generic image grabber, do not use directly. This is used
    by the Grabber.get_image() method.
    """
    def __init__(self, parent, urls, language):
        self.image_data = {}

        for url in urls:
            self.image_data[url] = StringIO()

        self.num_images = len(urls)
        self.num_finished = 0
        self.parent = parent

        for url in urls:
            # create the readers for the urls
            HTTPReader(url, self, language)


    def handle_header(self, url, header):
        """
        Handle header
        """
        pass
        

    def handle_line(self, url, line):
        """
        Handle one line of data
        """
        self.image_data[url].write(line)


    def handle_finished(self, url):
        """
        Count finished items
        """
        self.num_finished += 1
        self.image_data[url].seek(0)

        if self.num_images == self.num_finished:
            self.parent.deliver_result(self.image_data)


    def handle_progress(self, url, fetched=None, length=None):
        """
        Handle progress
        """
        self.parent.handle_progress(url, fetched, length)
        
--- NEW FILE: __init__.py ---

--- NEW FILE: grabberitem.py ---
# -*- coding: iso-8859-1 -*-
# -----------------------------------------------------------------------------
# grabberitem.py - Basic item from grabbers
# -----------------------------------------------------------------------------
# $Id: grabberitem.py,v 1.1 2005/04/15 20:56:08 vifredr Exp $
#
# Notes: The unicode stuff is _not_ verified!
#
# -----------------------------------------------------------------------------
# Freevo - A Home Theater PC framework
# Copyright (C) 2002-2004 Krister Lagerstrom, Dirk Meyer, et al.
#
# First Edition: Viggo Fredriksen <[EMAIL PROTECTED]>
# Maintainer:    Viggo Fredriksen <[EMAIL PROTECTED]>
#
# Please see the file freevo/Docs/CREDITS for a complete list of authors.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MER-
# CHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# -----------------------------------------------------------------------------

import config
import copy
import htmlentitydefs
from types import StringTypes

class GrabberItem:
    """
    Basic item returned by grabbers. The information is stored in
    plain attributes, which can also be read like dict entries; an
    attribute that does not exist reads as None that way.
    """

    def __init__(self):
        pass

    def __getitem__(self, attr):
        """
        Return the attribute with the given name or None.
        """
        return self.__dict__.get(attr)


    def to_unicode(self):
        """
        Set all string attributes to unicode
        """
        for name, value in list(self.__dict__.items()):
            if isinstance(value, StringTypes):
                # it's a string, convert it to unicode
                self.__dict__[name] = self.htmlenties2txt(value.strip())

    def __str__(self):
        """
        Return all string attributes, one 'name: value' per line.
        """
        return '\n'.join(['%s: %s' % (name, value)
                          for name, value in self.__dict__.items()
                          if isinstance(value, StringTypes)])

    def htmlenties2txt(self, string):
        """
        Converts a string to a string with all html entities resolved.
        Returns the result as Unicode object (that may contain chars
        outside 256).

        Named entities (&uuml;) as well as decimal (&#252;) and
        hexadecimal (&#xfc;) character references are replaced, some
        typographic entities are mapped to similar ASCII characters.
        Everything that is not a known entity terminated by ';' is
        left untouched.

        @param string: text which may contain html entities
        @returns     : unicode object
        """
        # typographic entities which are mapped to plain ASCII
        ascii_entities = {
            'ndash':  u'-',
            'bull':   u'-',
            'rsquo':  u"'",
            'lsquo':  u'`',
            'hellip': u'...',
        }

        # "&#039;" has to be replaced before the unterminated "&#039",
        # or its ';' is left behind in the text. "&#146;" is presumably
        # the windows-1252 apostrophe, which unichr(146) is not.
        string = Unicode(string)
        string = string.replace("&#039;", "'").replace("&#039", "'")
        string = string.replace("&#146;", "'")

        i = 0
        while i < len(string):
            amp = string.find("&", i) # find & as start of entity
            if amp == -1: # not found
                break
            # continue behind this '&', replaced text is not scanned again
            i = amp + 1

            semicolon = string.find(";", amp) # find ; as end of entity
            if semicolon == -1:
                # no ';' behind this point, nothing more can be an entity
                break

            name = string[amp + 1:semicolon]
            if name.startswith('#'):
                # numerical entity like "&#039;" or "&#x27;"
                try:
                    if name[1:2] in ('x', 'X'):
                        replacement = unichr(int(name[2:], 16))
                    else:
                        replacement = unichr(int(name[1:]))
                except (ValueError, OverflowError):
                    # not a number or not a valid character
                    continue
            elif name in ascii_entities:
                replacement = ascii_entities[name]
            elif name in htmlentitydefs.name2codepoint:
                # use the codepoint; the entitydefs values are latin-1
                # byte strings which can't be mixed with unicode
                replacement = unichr(htmlentitydefs.name2codepoint[name])
            else:
                # unknown name, maybe no entity at all ("AT&T; ...")
                continue

            # replace only this occurrence, so text produced by a
            # replacement (e.g. the '&' of "&amp;") is not decoded again
            string = string[:amp] + replacement + string[semicolon + 1:]
        return string
                


-------------------------------------------------------
SF email is sponsored by - The IT Product Guide
Read honest & candid reviews on hundreds of IT Products from real users.
Discover which products truly live up to the hype. Start reading now.
http://ads.osdn.com/?ad_id=6595&alloc_id=14396&op=click
_______________________________________________
Freevo-cvslog mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/freevo-cvslog

Reply via email to