Re: Reducing iconv-induced memory usage

Ludovic Courtès Tue, 26 Apr 2011 15:41:28 -0700

Hi!

So, here’s the patch.


It also makes UTF-8 input ~30% faster according to ports.bm (which
doesn’t benchmark output):

  * before:

    ("ports.bm: peek-char: latin-1 port" 700000 user 0.36)
    ("ports.bm: peek-char: utf-8 port, ascii character" 700000 user 0.35)
    ("ports.bm: peek-char: utf-8 port, Korean character" 700000 user 0.61)
    ("ports.bm: read-char: latin-1 port" 10000000 user 3.32)
    ("ports.bm: read-char: utf-8 port, ascii character" 10000000 user 3.33)
    ("ports.bm: read-char: utf-8 port, Korean character" 10000000 user 3.39)
    ("ports.bm: char-ready?: latin-1 port" 10000000 user 2.95)
    ("ports.bm: char-ready?: utf-8 port, ascii character" 10000000 user 2.96)
    ("ports.bm: char-ready?: utf-8 port, Korean character" 10000000 user 3.01)
    ("ports.bm: rdelim: read-line" 1000 user 3.1)

  * after:

    ("ports.bm: peek-char: latin-1 port" 700000 user 0.31)
    ("ports.bm: peek-char: utf-8 port, ascii character" 700000 user 0.24)
    ("ports.bm: peek-char: utf-8 port, Korean character" 700000 user 0.3)
    ("ports.bm: read-char: latin-1 port" 10000000 user 2.73)
    ("ports.bm: read-char: utf-8 port, ascii character" 10000000 user 3.38)
    ("ports.bm: read-char: utf-8 port, Korean character" 10000000 user 3.37)
    ("ports.bm: char-ready?: latin-1 port" 10000000 user 2.42)
    ("ports.bm: char-ready?: utf-8 port, ascii character" 10000000 user 2.41)
    ("ports.bm: char-ready?: utf-8 port, Korean character" 10000000 user 2.43)
    ("ports.bm: rdelim: read-line" 1000 user 1.91)

Comments?  OK to apply?

Thanks,
Ludo’.

diff --git a/libguile/ports.c b/libguile/ports.c
index 6e0ae6c..d728356 100644
--- a/libguile/ports.c
+++ b/libguile/ports.c
@@ -1057,6 +1057,7 @@ update_port_lf (scm_t_wchar c, SCM port)
   switch (c)
     {
     case '\a':
+    case EOF:
       break;
     case '\b':
       SCM_DECCOL (port);
@@ -1115,23 +1116,113 @@ utf8_to_codepoint (const scm_t_uint8 *utf8_buf, size_t size)
   return codepoint;
 }
 
-/* Read a codepoint from PORT and return it in *CODEPOINT.  Fill BUF
-   with the byte representation of the codepoint in PORT's encoding, and
-   set *LEN to the length in bytes of that representation.  Return 0 on
-   success and an errno value on error.  */
+/* Read a UTF-8 sequence from PORT.  On success, return 0 and set
+   *CODEPOINT to the codepoint that was read, fill BUF with its UTF-8
+   representation, and set *LEN to the length in bytes.  Return
+   `EILSEQ' on error.  */
 static int
-get_codepoint (SCM port, scm_t_wchar *codepoint,
-	       char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+get_utf8_codepoint (SCM port, scm_t_wchar *codepoint,
+		    scm_t_uint8 buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+{
+  int byte;
+
+  *len = 0;
+
+  byte = scm_get_byte_or_eof (port);
+  if (byte == EOF)
+    {
+      *codepoint = EOF;
+      return 0;
+    }
+
+  buf[0] = (scm_t_uint8) byte;
+  *len = 1;
+
+  if (buf[0] <= 0x7f)
+    *codepoint = buf[0];
+  else if ((buf[0] & 0xe0) == 0xc0)
+    {
+      byte = scm_get_byte_or_eof (port);
+      if (byte == EOF || ((byte & 0xc0) != 0x80))
+	goto invalid_seq;
+
+      buf[1] = (scm_t_uint8) byte;
+      *len = 2;
+
+      *codepoint = ((scm_t_wchar) buf[0] & 0x1f) << 6UL
+	| (buf[1] & 0x3f);
+    }
+  else if ((buf[0] & 0xf0) == 0xe0)
+    {
+      byte = scm_get_byte_or_eof (port);
+      if (byte == EOF || ((byte & 0xc0) != 0x80))
+	goto invalid_seq;
+
+      buf[1] = (scm_t_uint8) byte;
+      *len = 2;
+
+      byte = scm_get_byte_or_eof (port);
+      if (byte == EOF || ((byte & 0xc0) != 0x80))
+	goto invalid_seq;
+
+      buf[2] = (scm_t_uint8) byte;
+      *len = 3;
+
+      *codepoint = ((scm_t_wchar) buf[0] & 0x0f) << 12UL
+	| ((scm_t_wchar) buf[1] & 0x3f) << 6UL
+	| (buf[2] & 0x3f);
+    }
+  else
+    {
+      byte = scm_get_byte_or_eof (port);
+      if (byte == EOF || ((byte & 0xc0) != 0x80))
+	goto invalid_seq;
+
+      buf[1] = (scm_t_uint8) byte;
+      *len = 2;
+
+      byte = scm_get_byte_or_eof (port);
+      if (byte == EOF || ((byte & 0xc0) != 0x80))
+	goto invalid_seq;
+
+      buf[2] = (scm_t_uint8) byte;
+      *len = 3;
+
+      byte = scm_get_byte_or_eof (port);
+      if (byte == EOF || ((byte & 0xc0) != 0x80))
+	goto invalid_seq;
+
+      buf[3] = (scm_t_uint8) byte;
+      *len = 4;
+
+      *codepoint = ((scm_t_wchar) buf[0] & 0x07) << 18UL
+	| ((scm_t_wchar) buf[1] & 0x3f) << 12UL
+	| ((scm_t_wchar) buf[2] & 0x3f) << 6UL
+	| (buf[3] & 0x3f);
+    }
+
+  return 0;
+
+ invalid_seq:
+  /* Return the faulty byte.  */
+  scm_unget_byte (byte, port);
+
+  return EILSEQ;
+}
+
+/* Likewise, read a byte sequence from PORT, passing it through its
+   input conversion descriptor.  */
+static int
+get_iconv_codepoint (SCM port, scm_t_wchar *codepoint,
+		     char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
 {
+  scm_t_port *pt;
   int err, byte_read;
   size_t bytes_consumed, output_size;
   char *output;
   scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
-  scm_t_port *pt = SCM_PTAB_ENTRY (port);
 
-  if (SCM_UNLIKELY (pt->input_cd == (iconv_t) -1))
-    /* Initialize the conversion descriptors.  */
-    scm_i_set_port_encoding_x (port, pt->encoding);
+  pt = SCM_PTAB_ENTRY (port);
 
   for (output_size = 0, output = (char *) utf8_buf,
 	 bytes_consumed = 0, err = 0;
@@ -1174,10 +1265,44 @@ get_codepoint (SCM port, scm_t_wchar *codepoint,
 	output_size = sizeof (utf8_buf) - output_left;
     }
 
-  if (SCM_UNLIKELY (err != 0))
+
+  if (SCM_LIKELY (err == 0))
+    {
+      /* Convert the UTF8_BUF sequence to a Unicode code point.  */
+      *codepoint = utf8_to_codepoint (utf8_buf, output_size);
+      *len = bytes_consumed;
+    }
+
+  return err;
+}
+
+/* Read a codepoint from PORT and return it in *CODEPOINT.  Fill BUF
+   with the byte representation of the codepoint in PORT's encoding, and
+   set *LEN to the length in bytes of that representation.  Return 0 on
+   success and an errno value on error.  */
+static int
+get_codepoint (SCM port, scm_t_wchar *codepoint,
+	       char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+{
+  int err;
+  scm_t_port *pt = SCM_PTAB_ENTRY (port);
+
+  if (pt->input_cd == (iconv_t) -1)
+    /* Initialize the conversion descriptors, if needed.  */
+    scm_i_set_port_encoding_x (port, pt->encoding);
+
+  if (pt->input_cd == (iconv_t) -1)
+    err = get_utf8_codepoint (port, codepoint, (scm_t_uint8 *) buf, len);
+  else
+    err = get_iconv_codepoint (port, codepoint, buf, len);
+
+  if (SCM_LIKELY (err == 0))
+    update_port_lf (*codepoint, port);
+  else
     {
-      /* Reset the `iconv' state.  */
-      iconv (pt->input_cd, NULL, NULL, NULL, NULL);
+      if (pt->input_cd != (iconv_t) -1)
+	/* Reset the `iconv' state.  */
+	iconv (pt->input_cd, NULL, NULL, NULL, NULL);
 
       if (pt->ilseq_handler == SCM_ICONVEH_QUESTION_MARK)
 	{
@@ -1189,14 +1314,6 @@ get_codepoint (SCM port, scm_t_wchar *codepoint,
 	 SCM_ICONVEH_ESCAPE_SEQUENCE (the latter doesn't make sense for
 	 input encoding errors.)  */
     }
-  else
-    /* Convert the UTF8_BUF sequence to a Unicode code point.  */
-    *codepoint = utf8_to_codepoint (utf8_buf, output_size);
-
-  if (SCM_LIKELY (err == 0))
-    update_port_lf (*codepoint, port);
-
-  *len = bytes_consumed;
 
   return err;
 }
@@ -2027,28 +2144,35 @@ scm_i_set_port_encoding_x (SCM port, const char *encoding)
   if (encoding == NULL)
     encoding = "ISO-8859-1";
 
-  pt->encoding = scm_gc_strdup (encoding, "port");
+  if (pt->encoding != encoding)
+    pt->encoding = scm_gc_strdup (encoding, "port");
 
-  if (SCM_CELL_WORD_0 (port) & SCM_RDNG)
+  /* If ENCODING is UTF-8, then no conversion descriptor is opened
+     because we do I/O ourselves.  This saves 100+ KiB for each
+     descriptor.  */
+  if (strcmp (encoding, "UTF-8"))
     {
-      /* Open an input iconv conversion descriptor, from ENCODING
-	 to UTF-8.  We choose UTF-8, not UTF-32, because iconv
-	 implementations can typically convert from anything to
-	 UTF-8, but not to UTF-32 (see
-	 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>).  */
-      new_input_cd = iconv_open ("UTF-8", encoding);
-      if (new_input_cd == (iconv_t) -1)
-	goto invalid_encoding;
-    }
+      if (SCM_CELL_WORD_0 (port) & SCM_RDNG)
+	{
+	  /* Open an input iconv conversion descriptor, from ENCODING
+	     to UTF-8.  We choose UTF-8, not UTF-32, because iconv
+	     implementations can typically convert from anything to
+	     UTF-8, but not to UTF-32 (see
+	     <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>).  */
+	  new_input_cd = iconv_open ("UTF-8", encoding);
+	  if (new_input_cd == (iconv_t) -1)
+	    goto invalid_encoding;
+	}
 
-  if (SCM_CELL_WORD_0 (port) & SCM_WRTNG)
-    {
-      new_output_cd = iconv_open (encoding, "UTF-8");
-      if (new_output_cd == (iconv_t) -1)
+      if (SCM_CELL_WORD_0 (port) & SCM_WRTNG)
 	{
-	  if (new_input_cd != (iconv_t) -1)
-	    iconv_close (new_input_cd);
-	  goto invalid_encoding;
+	  new_output_cd = iconv_open (encoding, "UTF-8");
+	  if (new_output_cd == (iconv_t) -1)
+	    {
+	      if (new_input_cd != (iconv_t) -1)
+		iconv_close (new_input_cd);
+	      goto invalid_encoding;
+	    }
 	}
     }
 
diff --git a/libguile/print.c b/libguile/print.c
index 1399566..d18c054 100644
--- a/libguile/print.c
+++ b/libguile/print.c
@@ -821,33 +821,58 @@ codepoint_to_utf8 (scm_t_wchar ch, scm_t_uint8 utf8[4])
   return len;
 }
 
-/* Display the LEN codepoints in STR to PORT according to STRATEGY;
-   return the number of codepoints successfully displayed.  If NARROW_P,
-   then STR is interpreted as a sequence of `char', denoting a Latin-1
-   string; otherwise it's interpreted as a sequence of
-   `scm_t_wchar'.  */
-static size_t
-display_string (const void *str, int narrow_p,
-		size_t len, SCM port,
-		scm_t_string_failed_conversion_handler strategy)
-
-{
 #define STR_REF(s, x)				\
   (narrow_p					\
    ? (scm_t_wchar) ((unsigned char *) (s))[x]	\
    : ((scm_t_wchar *) (s))[x])
 
+/* Write STR to PORT as UTF-8.  STR is a LEN-codepoint string; it is
+   narrow if NARROW_P is true, wide otherwise.  Return LEN.  */
+static size_t
+display_string_as_utf8 (const void *str, int narrow_p, size_t len,
+			SCM port)
+{
+  size_t printed = 0;
+
+  while (len > printed)
+    {
+      size_t utf8_len, i;
+      char *input, utf8_buf[256];
+
+      /* Convert STR to UTF-8.  */
+      for (i = printed, utf8_len = 0, input = utf8_buf;
+	   i < len && utf8_len + 4 < sizeof (utf8_buf);
+	   i++)
+	{
+	  utf8_len += codepoint_to_utf8 (STR_REF (str, i),
+					 (scm_t_uint8 *) input);
+	  input = utf8_buf + utf8_len;
+	}
+
+      /* INPUT was successfully converted, entirely; print the
+	 result.  */
+      scm_lfwrite (utf8_buf, utf8_len, port);
+      printed += i - printed;
+    }
+
+  assert (printed == len);
+
+  return len;
+}
+
+/* Convert STR through PORT's output conversion descriptor and write the
+   output to PORT.  Return the number of codepoints written.  */
+static size_t
+display_string_using_iconv (const void *str, int narrow_p, size_t len,
+			    SCM port,
+			    scm_t_string_failed_conversion_handler strategy)
+{
   size_t printed;
   scm_t_port *pt;
 
   pt = SCM_PTAB_ENTRY (port);
 
-  if (SCM_UNLIKELY (pt->output_cd == (iconv_t) -1))
-    /* Initialize the conversion descriptors.  */
-    scm_i_set_port_encoding_x (port, pt->encoding);
-
   printed = 0;
-
   while (len > printed)
     {
       size_t done, utf8_len, input_left, output_left, i;
@@ -880,7 +905,7 @@ display_string (const void *str, int narrow_p,
 
       if (SCM_UNLIKELY (done == (size_t) -1))
 	{
-          int errno_save = errno;
+	  int errno_save = errno;
 
 	  /* Reset the `iconv' state.  */
 	  iconv (pt->output_cd, NULL, NULL, NULL, NULL);
@@ -928,7 +953,34 @@ display_string (const void *str, int narrow_p,
     }
 
   return printed;
+}
+
 #undef STR_REF
+
+/* Display the LEN codepoints in STR to PORT according to STRATEGY;
+   return the number of codepoints successfully displayed.  If NARROW_P,
+   then STR is interpreted as a sequence of `char', denoting a Latin-1
+   string; otherwise it's interpreted as a sequence of
+   `scm_t_wchar'.  */
+static size_t
+display_string (const void *str, int narrow_p,
+		size_t len, SCM port,
+		scm_t_string_failed_conversion_handler strategy)
+
+{
+  scm_t_port *pt;
+
+  pt = SCM_PTAB_ENTRY (port);
+
+  if (pt->output_cd == (iconv_t) -1)
+    /* Initialize the conversion descriptors, if needed.  */
+    scm_i_set_port_encoding_x (port, pt->encoding);
+
+  if (pt->output_cd == (iconv_t) -1)
+    return display_string_as_utf8 (str, narrow_p, len, port);
+  else
+    return display_string_using_cd (str, narrow_p, len,
+				    port, strategy);
 }
 
 /* Attempt to display CH to PORT according to STRATEGY.  Return non-zero
diff --git a/test-suite/tests/ports.test b/test-suite/tests/ports.test
index 9d3000c..d5b1b60 100644
--- a/test-suite/tests/ports.test
+++ b/test-suite/tests/ports.test
@@ -391,7 +391,8 @@
                      (with-fluids ((%default-port-encoding e))
                        (call-with-output-string
                          (lambda (p)
-                           (display (port-encoding p) p)))))
+                           (and (string=? e (port-encoding p))
+                                (display (port-encoding p) p))))))
                    encodings)
               encodings)))
 
@@ -462,6 +463,15 @@
            (= (port-line p) 0)
            (= (port-column p) 0))))
 
+  (pass-if "peek-char [utf-16]"
+    (let ((p (with-fluids ((%default-port-encoding "UTF-16BE"))
+               (open-input-string "안녕하세요"))))
+      (and (char=? (peek-char p) #\안)
+           (char=? (peek-char p) #\안)
+           (char=? (peek-char p) #\안)
+           (= (port-line p) 0)
+           (= (port-column p) 0))))
+
   (pass-if "read-char, wrong encoding, error"
     (let ((p (open-bytevector-input-port #vu8(255 65 66 67))))
       (catch 'decoding-error

Re: Reducing iconv-induced memory usage

Reply via email to