Re: utf-8 encoding

Patrick Ohly Wed, 02 Apr 2003 03:39:51 -0800

On Tue, 2003-02-25 at 10:07, Patrick Ohly wrote:
[current state of encoding support in Python plucker:]
> - characters with utf-8 encoding are copied verbatim to the pdb file
> - the encoding of the pdb is set to utf-8 correctly
> - the V1.2 plucker viewer cannot handle utf-8 and interprets the
>   byte sequence C3 A4 (a umlaut) as A tilde and Euro, as if it
>   was Latin encoding


Michael Nordström has confirmed this and suggested that the parser
should convert into the character set supported by the viewer,
instead of adding support for all kinds of character sets to the
viewer.

Here's a patch that implements this. At the moment, the .pdb will always
use latin-1 encoding. This could be made configurable; when no output
charset is set, the behaviour of the parser should not change.
For Japanese devices it might be necessary to really make
it configurable, otherwise I see no reason to not make
latin-1 the default.

I also sent the patch to Michael and Bill Janssen, but haven't heard
back from them, so I post the patch here just in case someone
runs into the same problem and needs a solution.

-- 
Freundliche Gruesse / Best Regards

Patrick Ohly
Senior Software Engineer
--------------------------------------------------------------------
//// pallas 
Pallas GmbH / Hermuelheimer Str. 10 / 50321 Bruehl / Germany
[EMAIL PROTECTED] / www.pallas.com
Tel +49-2232-1896-30 / Fax +49-2232-1896-29
--------------------------------------------------------------------

? __init__.py
Index: TextParser.py
===================================================================
RCS file: /cvs/plucker/plucker_src/parser/python/PyPlucker/TextParser.py,v
retrieving revision 1.52
diff -c -r1.52 TextParser.py
*** TextParser.py       9 Feb 2003 22:20:43 -0000       1.52
--- TextParser.py       21 Mar 2003 10:50:24 -0000
***************
*** 53,59 ****
  from PyPlucker import PluckerDocs
  from PyPlucker import Url
  from PyPlucker import DEFAULT_LOCALE_CHARSET_ENCODING
! from PyPlucker.helper.CharsetMapping import charset_name_to_mibenum
  from PyPlucker import UtilFns
  
  message = UtilFns.message
--- 53,59 ----
  from PyPlucker import PluckerDocs
  from PyPlucker import Url
  from PyPlucker import DEFAULT_LOCALE_CHARSET_ENCODING
! from PyPlucker.helper.CharsetMapping import charset_name_to_mibenum,charset_mibenum_to_name
  from PyPlucker import UtilFns
  
  message = UtilFns.message
***************
*** 324,329 ****
--- 324,356 ----
      def __init__ (self, url, config, **keyword_args):
          self._doc = PluckerDocs.PluckerTextDocument (url)
          self._config = config
+         
+         # Desired output character set in the document.
+         # If not set, then the input character set is used without conversion
+         # (the traditional behaviour).
+         self._outcharsetname = 'ISO-8859-1';
+ 
+         # See if we can supply a default input charset.
+         # Do this as soon as possible, because any text added
+         # may have to be reencoded, which is not supported once
+         # it has been added to the document.
+         if config:
+             userspec = config.get_int('default_charset', 0)
+         else:
+             userspec = None
+         locale_default = charset_name_to_mibenum(DEFAULT_LOCALE_CHARSET_ENCODING)
+         # the userspec will take precedence
+         if userspec:
+             self.set_charset(userspec)
+         # OK, so we have no idea.  Use the HTTP default of ISO-8859-1 (4) for
+         # http: URLs, and the environment default (if any) for others
+         elif (string.lower(url[:5]) == 'http:' or string.lower(url[:6]) == 'https:'):
+             self.set_charset(4)
+         elif locale_default:
+             self.set_charset(locale_default)
+         else:
+             self.set_charset(4)
+         
          self._attributes = AttributeStack ()
          self._paragraph = PluckerDocs.PluckerTextParagraph ()
          self._is_new_paragraph = 1
***************
*** 342,348 ****
          # (which may not be black if they have hacked it with Kroma or similar utility) until
          # first color change, or new paragraph, then will go to black. This makes sure document
          # starts off in black.
!         self._color_paragraphs = config.get_bool("color_paragraphs")
          if (self._color_paragraphs):
              self._paragraph.add_set_forecolor (self._attributes.get_forecolor ())
  
--- 369,375 ----
          # (which may not be black if they have hacked it with Kroma or similar utility) until
          # first color change, or new paragraph, then will go to black. This makes sure document
          # starts off in black.
!         self._color_paragraphs = config and config.get_bool("color_paragraphs")
          if (self._color_paragraphs):
              self._paragraph.add_set_forecolor (self._attributes.get_forecolor ())
  
***************
*** 352,358 ****
  
      
      def set_charset(self, charset):
!         self._doc.set_charset(charset_name_to_mibenum(charset))
  
      def set_id_tag(self, tag):
          self._doc.register_doc(tag)
--- 379,397 ----
  
      
      def set_charset(self, charset):
!         # handle both name and MIB integer
!         if type(charset) == type(''):
!             charset = charset_name_to_mibenum(charset)
!         
!         if self._outcharsetname:
!             # use charset as input format, but the output charset
!             # in the document
!             self._incharsetname = charset_mibenum_to_name(charset)
!             self._doc.set_charset(charset_name_to_mibenum(self._outcharsetname))
!         else:
!             # traditional behaviour: create document with same
!             # charset as input document
!             self._doc.set_charset(charset)
  
      def set_id_tag(self, tag):
          self._doc.register_doc(tag)
***************
*** 369,391 ****
              self._doc.add_paragraph (self._paragraph)
              self._paragraph = PluckerDocs.PluckerTextParagraph ()
              self._is_new_paragraph = 1
-         if not self._doc.get_charset():
-             # see if we can supply a default charset
-             url = self._doc.get_url()
-             if self._config:
-                 userspec = self._config.get_int('default_charset', 0)
-             else:
-                 userspec = None
-             locale_default = charset_name_to_mibenum(DEFAULT_LOCALE_CHARSET_ENCODING)
-             # the userspec will take precedence
-             if userspec:
-                 self._doc.set_charset(userspec)
-             # OK, so we have no idea.  Use the HTTP default of ISO-8859-1 (4) for
-             # http: URLs, and the environment default (if any) for others
-             elif (string.lower(url[:5]) == 'http:' or string.lower(url[:6]) == 'https:'):
-                 self._doc.set_charset(4)
-             elif locale_default:
-                 self._doc.set_charset(locale_default)
  
      def add_name (self, name):
          """Give name to the current paragraph"""
--- 408,413 ----
***************
*** 626,632 ****
                  while new_spacing > 7:
                      self._paragraph.set_extra_spacing (7)
                      new_spacing = new_spacing - 7
!                     self.add_text ("  ")
                      self._ship_paragraph ()
                  self._paragraph.set_extra_spacing (new_spacing)
          else:
--- 648,654 ----
                  while new_spacing > 7:
                      self._paragraph.set_extra_spacing (7)
                      new_spacing = new_spacing - 7
!                     self.add_text ("  ", "ISO-8859-1")
                      self._ship_paragraph ()
                  self._paragraph.set_extra_spacing (new_spacing)
          else:
***************
*** 662,669 ****
          return (first, rest)
          
  
!     def add_text (self, text):
!         """Add some text, maybe even many lines."""
          lines = string.split (text, "\n")
          for i in range (len (lines)):
              line = lines[i]
--- 684,697 ----
          return (first, rest)
          
  
!     def add_text (self, text, charsetname=None):
!         """
!         Add some text, maybe even many lines.
!         Character set of text is either the one specified explicitly,
!         or the one set with set_charset() otherwise.
!         """
! 
!         text = self.reencode_text (text, charsetname)
          lines = string.split (text, "\n")
          for i in range (len (lines)):
              line = lines[i]
***************
*** 698,703 ****
--- 726,739 ----
                  # add the newline that was left out
                  self.add_vspace (n_units=0, additional=1)
  
+     def add_cell_text (self, atable, text, charsetname=None):
+         """
+         Add some text to a table cell.
+         Character set of text is either the one specified explicitly,
+         or the one set with set_charset() otherwise.
+         """
+         atable.add_cell_text (self.reencode_text (text, charsetname))
+ 
      def add_unicode_char (self, char_code, text_alternative):
          """Add a Unicode character, along with a non-Unicode text alternative."""
          self._paragraph.add_unicode_char (char_code, text_alternative)
***************
*** 743,749 ****
          self._is_new_line = 0
          self._paragraph.add_hr (height, width, perc_width)
  
! 
  
  
  class PlainTextParser:
--- 779,807 ----
          self._is_new_line = 0
          self._paragraph.add_hr (height, width, perc_width)
  
!     def reencode_text (self, text, charsetname=None):
!         """
!         Convert character set from charsetname to self._ourcharsetname,
!         if both are given and differ. Returns the modified text.
!         """
! 
!         # convert to desired charset?
!         if self._outcharsetname:
!             if not charsetname:
!                 charsetname = self._incharsetname
!             if self._outcharsetname != charsetname:
!                 try:
!                     text = text.decode(charsetname, 'replace'). \
!                            encode(self._outcharsetname, 'replace')
!                 except AttributeError, x:
!                     if str(x) == 'decode':
!                         print "Python 2.2 is required to convert character encoding %s to %s." % \
!                               (charsetname, self._outcharsetname)
!                         print "Proceeding without conversion."
!                         self._outcharsetname = None
!                     else:
!                         raise
!         return text
  
  
  class PlainTextParser:
***************
*** 1079,1089 ****
      ##  Private functions follow
      ##
  
!     def _add_text (self, text):
          """Add some text.  This may contain newlines, however use
          _add_vspace() to do that explicitly if you want to."""
          if self._visible:
!             self._doc.add_text (text)
              self._element_beginning = 0
  
  
--- 1137,1147 ----
      ##  Private functions follow
      ##
  
!     def _add_text (self, text, charsetname=None):
          """Add some text.  This may contain newlines, however use
          _add_vspace() to do that explicitly if you want to."""
          if self._visible:
!             self._doc.add_text (text, charsetname)
              self._element_beginning = 0
  
  
***************
*** 1243,1249 ****
          pass
  
      
!     def handle_data (self, data):
          if self._clean_whitespace[-1]:
              data = _RE_WHITESPACE.sub(" ", data, 0)   # data = re.sub("[\n\r\f \t]+", " ", data)
          else:
--- 1301,1307 ----
          pass
  
      
!     def handle_data (self, data, charsetname=None):
          if self._clean_whitespace[-1]:
              data = _RE_WHITESPACE.sub(" ", data, 0)   # data = re.sub("[\n\r\f \t]+", " ", data)
          else:
***************
*** 1297,1305 ****
                          style_str = struct.pack (">BB", 0, 0x78)
                      self.atable.add_cell_text(style_str)
                      self.last_table_strike = new_strike
!                 self.atable.add_cell_text (data)
              else:
!                 self._add_text (data)
  
  
      def start_body (self, attributes):
--- 1355,1363 ----
                          style_str = struct.pack (">BB", 0, 0x78)
                      self.atable.add_cell_text(style_str)
                      self.last_table_strike = new_strike
!                 self._doc.add_cell_text (self.atable, data, charsetname)
              else:
!                 self._add_text (data, charsetname)
  
  
      def start_body (self, attributes):
***************
*** 1629,1635 ****
      def do_p (self, attributes):
          if self._needs_newpara ():
              if self._indent_paragraphs:
!                 self._add_text('\xa0\xa0\xa0\xa0\xa0\xa0')
              else:
                  self._add_vspace (2)
  
--- 1687,1693 ----
      def do_p (self, attributes):
          if self._needs_newpara ():
              if self._indent_paragraphs:
!                 self._add_text('\xa0\xa0\xa0\xa0\xa0\xa0', 'ISO-8859-1')
              else:
                  self._add_vspace (2)
  
***************
*** 2040,2046 ****
      def start_q (self, attr):
          self._add_vspace (2)
          self._doc.indent (5, 5)
!         self._add_text ("  ``")
          self._doc.indent (7, 7)
   
  
--- 2098,2104 ----
      def start_q (self, attr):
          self._add_vspace (2)
          self._doc.indent (5, 5)
!         self._add_text ("  ``", "ISO-8859-1")
          self._doc.indent (7, 7)
   
  
***************
*** 2049,2055 ****
          # the closing quotes can be right of the quoted block
          # but not left of them
          self._doc.indent (0, -12)
!         self._add_text ("")
          self._doc.indent (-12, 0)
          self._add_vspace (2)
  
--- 2107,2113 ----
          # the closing quotes can be right of the quoted block
          # but not left of them
          self._doc.indent (0, -12)
!         self._add_text ("", "ISO-8859-1")
          self._doc.indent (-12, 0)
          self._add_vspace (2)
  
***************
*** 2127,2133 ****
                  if len(s) == 1:
                      val = ord(s)
                      if (val >= 0xa0 and val < 0x100) or (val >= 0x00 and val < 0xFF):
!                         self.handle_data (s)
                      else:
                          self._add_unicode_char(val, "&#%d;" % val)
                  else:
--- 2185,2191 ----
                  if len(s) == 1:
                      val = ord(s)
                      if (val >= 0xa0 and val < 0x100) or (val >= 0x00 and val < 0xFF):
!                         self.handle_data (s, 'ISO-8859-1')
                      else:
                          self._add_unicode_char(val, "&#%d;" % val)
                  else:
***************
*** 2136,2142 ****
                          self.unknown_charref(m.group(1))
              else:
                  self._unknown["entityref-%s"%ref] = 1
!                 self.handle_data('?')
  
  
  
--- 2194,2200 ----
                          self.unknown_charref(m.group(1))
              else:
                  self._unknown["entityref-%s"%ref] = 1
!                 self.handle_data('?', 'ISO-8859-1')

Re: utf-8 encoding

Reply via email to