Hello
This patch adds support for different encodings to the Python parser.
Two options are added, --input-charset and --output-charset.
The first one tells the parser to assume that input documents (such as
URLs) are in the given encoding, _if it cannot be determined
automatically_ (so it does not override information in the
HTTP headers). The second one is the encoding of the plucked *.pdb document.
This absolutely requires Python 2.0 or later, because of the encoding and Unicode
stuff (maybe it would work with Python 1.6; if it is necessary,
I can make a version that would run on Python 1.5, with a fallback to
no recoding).
Also, the parser will crash if --input-charset and --output-charset
are incompatible (good for debugging, but this should be changed in future
release versions).
There is a special value for the output encoding, --output-charset=unicode,
that tells the parser to include non-ASCII characters as Unicode values.
In order for this to work, there is a tiny patch for the viewer, but note that
it allows only ISO-8859-1 Unicode characters to be displayed for now
(and you have to compile plucker without IMODE support).
So to summarize:
To get an ISO-8859-2 page in ISO-8859-2 encoding, do
plucker-build --input-charset=ISO8859_2 --output-charset=ISO8859_2 -f ... http://...
(you can omit --input-charset if the page has correct headers)
To do the same, but the result should be in CP1250:
plucker-build --input-charset=ISO8859_2 --output-charset=charset1250 -f ... http://...
To encode a text file in KOI8-R encoding:
plucker-build --input-charset=KOI8_R --output-charset=KOI8_R -f ... file.txt
If your locale happens to be KOI8_R, it will be used to guess the input charset, so
you just need to do:
plucker-build --output-charset=KOI8_R -f ... file.txt
To use the unicode feature (only for pages in ISO8859_1 encoding for now):
plucker-build --input-charset=ISO8859_1 --output-charset=unicode -f ... http://...
More to come.
--
-----------------------------------------------------------
| Radovan Garabík http://melkor.dnp.fmph.uniba.sk/~garabik/ |
| __..--^^^--..__ garabik @ melkor.dnp.fmph.uniba.sk |
-----------------------------------------------------------
Antivirus alert: file .signature infected by signature virus.
Hi! I'm a signature virus! Copy me into your signature file to help me spread!
Only in plucker-new/: aclocal.m4
Only in plucker-new/: configure
diff -ur plucker/parser/python/PyPlucker/helper/CharsetMapping.py
plucker-new/parser/python/PyPlucker/helper/CharsetMapping.py
--- plucker/parser/python/PyPlucker/helper/CharsetMapping.py 2002-05-18
12:28:24.000000000 +0200
+++ plucker-new/parser/python/PyPlucker/helper/CharsetMapping.py 2004-02-25
23:25:37.000000000 +0100
@@ -1084,7 +1084,6 @@
def charset_known_names ():
return NamedCharsets.keys()
-
if __name__ == '__main__':
#Here could be add some "name" and "MIBenum" pairs thats not in the list.
diff -ur plucker/parser/python/PyPlucker/Spider.py
plucker-new/parser/python/PyPlucker/Spider.py
--- plucker/parser/python/PyPlucker/Spider.py 2004-02-02 03:31:58.000000000 +0100
+++ plucker-new/parser/python/PyPlucker/Spider.py 2004-02-26 17:49:00.000000000
+0100
@@ -1278,6 +1278,10 @@
message(0, " Set or clear the beamable bit in the output
file.")
message(0, " --charset=<name>:")
message(0, " Set the default charset to that specified by
<name>.")
+ message(0, " --output-charset=<name>:")
+ message(0, " Set the output charset of generated document
to that specified by <name>.")
+ message(0, " --input-charset=<name>:")
+ message(0, " Assume input charset to that specified by
<name>.")
message(0, " --owner-id=<name>:")
message(0, " Set owner-id of the output document to
<name>.")
message(0, " --url-pattern=<regexp-pattern>:")
@@ -1351,6 +1355,8 @@
copy_protect = None
iconfile = None
default_charset = None
+ output_charset = None
+ input_charset = None
owner_id = None
url_pattern = None
referrer = None
@@ -1376,7 +1382,7 @@
"maxheight=", "maxwidth=", "alt-maxheight=",
"alt-maxwidth=",
"compression=", "home-url=", "update-cache",
"launchable",
"not-launchable", "backup", "no-backup",
"beamable", "not-beamable",
- "icon=", "charset=", "owner-id=",
"url-pattern=", "referrer=",
+ "icon=", "charset=", "output-charset=",
"input-charset=", "owner-id=", "url-pattern=", "referrer=",
"user-agent=", "title=", "author=",
"status-file=", "version",
"tables", "depth-first", "http-proxy=",
"http-proxy-user=", "http-proxy-pass=",
"fragments=", "creator-id="])
@@ -1496,6 +1502,10 @@
iconfile = arg
elif opt == "--charset":
default_charset = arg
+ elif opt == "--output-charset":
+ output_charset = arg
+ elif opt == "--input-charset":
+ input_charset = arg
elif opt == "--owner-id":
owner_id = arg
elif opt == "--referrer":
@@ -1602,7 +1612,15 @@
if zlib_compression == 'false':
message('Specification of an owner-id forces use of zlib compression...')
zlib_compression = 'true'
-
+
+ if output_charset is None:
+ output_charset = 'ascii'
+ config.set ('output_charset', output_charset)
+
+ if input_charset is None:
+ input_charset = 'utf-8'
+ config.set ('input_charset', input_charset)
+
mibenum = None
# if not specified on command line, look in .pluckerrc
if default_charset is None:
diff -ur plucker/parser/python/PyPlucker/TextParser.py
plucker-new/parser/python/PyPlucker/TextParser.py
--- plucker/parser/python/PyPlucker/TextParser.py 2004-02-18 03:25:49.000000000
+0100
+++ plucker-new/parser/python/PyPlucker/TextParser.py 2004-02-26 17:50:42.000000000
+0100
@@ -525,12 +525,23 @@
return self._tags[self._stack[-1]]
+def convertUnicodeToPlucker(text, outcharset):
+ """Converts unicode string into desired output encoding,
+ or leaves it as unicode if output encoding has special value 'unicode'"""
+ if outcharset!="unicode": # special value, use unicode characters in plucked
document
+ # otherwise convert to specified encoding and use it as
+ # a raw string
+ message(4, "Converting from Unicode string to "+outcharset)
+ text = text.encode(outcharset)
+ return text
+
class TextDocBuilder:
"""Encapsulate the knowledge of when to change styles, add paragraphs, etc."""
def __init__ (self, url, config, **keyword_args):
+ message(2,"initializing textdocbuilder")
self._doc = PluckerDocs.PluckerTextDocument (url)
self._config = config
self._attributes = AttributeStack ()
@@ -875,7 +886,24 @@
def add_text (self, text):
- """Add some text, maybe even many lines."""
+ """Add some text, maybe even many lines.
+ Text can be either a string or a unicode string.
+ """
+
+ def add_unicode_text(paragraph, text):
+ if type(text)==type(""): # non-unicode string, shortcut
+ message(4, "Adding 8-bit text")
+ paragraph.add_text(text)
+ elif type(text)==type(u""):
+ message(4, "Adding Unicode text")
+ for c in text:
+ if ord(c)<128:
+ paragraph.add_text(str(c))
+ else:
+ paragraph.add_unicode_char(ord(c), "?")
+ else:
+ raise "Unexpected text type"
+
lines = string.split (text, "\n")
for i in range (len (lines)):
line = lines[i]
@@ -891,7 +919,7 @@
if rest_size < 0:
rest_size = 0
(first, rest) = self._find_text_split (line, rest_size)
- self._paragraph.add_text (first)
+ add_unicode_text(self._paragraph, first)
self._approximate_size = self._approximate_size + len (first)
self._is_new_paragraph = 0
self._is_new_line = 0
@@ -901,7 +929,7 @@
break
if line:
- self._paragraph.add_text (line)
+ add_unicode_text(self._paragraph, line)
self._approximate_size = self._approximate_size + len (line)
self._is_new_paragraph = 0
self._is_new_line = 0
@@ -963,12 +991,17 @@
def __init__ (self, url, text, headers, config, attribs):
text = _clean_newlines (text)
+ textcharset = config.get_string("input_charset")
# This we use to build the document
self._doc = TextDocBuilder (url, config)
if headers.has_key("charset"):
- self._doc.set_charset (headers["charset"])
+ textcharset = headers["charset"]
+ self._doc.set_charset (textcharset)
elif attribs.has_key("charset"):
- self._doc.set_charset (attribs["charset"])
+ textcharset = attribs["charset"]
+ self._doc.set_charset (textcharset)
+ text = unicode(text, textcharset)
+ message(4, "PlainTextParser: converting into UCS2 from "+textcharset)
self._url = url
self._text = text
# In these two lists we store tuples of (url, attributes) for encountered
anchors
@@ -976,6 +1009,8 @@
self._anchors = []
self._images = []
+ text = convertUnicodeToPlucker(text, config.get_string("output_charset"))
+
self._doc.add_text (text)
self._doc.close ()
@@ -1063,6 +1098,12 @@
self._charset = headers.has_key('charset') and
charset_name_to_mibenum(headers['charset'])
if self._charset:
self._doc.set_charset(headers['charset'])
+ # charset (python name of it) of current document - first: default
+ self.html_charset = config.get_string("input_charset")
+ # second: from headers
+ if headers.has_key('charset'):
+ self.html_charset = headers['charset']
+ message(4, "Setting html charset to "+self.html_charset)
# Since some users are really stupid and use HTML wrong, we need a
# stack of these values
self._visibility_stack = []
@@ -1299,6 +1340,9 @@
"""Add some text. This may contain newlines, however use
_add_vspace() to do that explicitly if you want to."""
if self._visible:
+ message(4, "StructuredHTMLParser: converting into Unicode from
"+self.html_charset)
+ text = unicode(text, self.html_charset)
+ text = convertUnicodeToPlucker(text,
self._config.get_string("output_charset"))
if self.atable is not None and self.in_cell:
self.atable.add_cell_text (text)
else:
Only in plucker-new/viewer: aclocal.m4
Only in plucker-new/viewer: config.h.in
Only in plucker-new/viewer: configure
diff -ur plucker/viewer/os.c plucker-new/viewer/os.c
--- plucker/viewer/os.c 2004-01-04 13:02:09.000000000 +0100
+++ plucker-new/viewer/os.c 2004-02-25 23:31:15.000000000 +0100
@@ -371,16 +371,22 @@
if ( charEncoding != charEncodingPalmLatin )
return 0;
+
entries = sizeof(Latin1Mapping)/sizeof(CharMapping);
for ( i = 0 ; i < entries; i++ ) {
if ( Latin1Mapping [ i ].unicodeValue == 0 )
return 0;
- else if ( charValue < Latin1Mapping [ i ].unicodeValue )
+/* else if ( charValue < Latin1Mapping [ i ].unicodeValue )
return 0;
+*/
else if ( Latin1Mapping [ i ].unicodeValue == charValue )
return Latin1Mapping[ i ].palmCharValue;
}
+
+ if (charValue <= 255)
+ return charValue;
+
return 0;
}
Only in plucker/viewer: sony_sdk