Re: [elinks-dev] Line-drawing characters when dumping Web pages

2009-06-27 Thread Kalle Olavi Niemitalo
Kalle Olavi Niemitalo k...@iki.fi writes:

> The attached patch for elinks-0.12
> (20dfdb284f9a23742800fb5b4023bef54c6ad982) implements this, but
> I'm not sure it is the right solution, because e.g. KOI8-R also
> supports line-drawing characters so the fix should preferably
> not be specific to UTF-8.

The attached new version of the patch makes elinks --dump use
line-drawing characters in all charsets where they are available.
I think this should be applied to master after the prerequisites
have been merged from elinks-0.12.

From 0dc9ac1a83f7a7d228476a4829c4b2a55daf Mon Sep 17 00:00:00 2001
From: Kalle Olavi Niemitalo k...@iki.fi
Date: Sun, 21 Jun 2009 18:30:36 +0300
Subject: [PATCH] dump: Use box-drawing characters if available

---
 NEWS   |2 +
 src/viewer/dump/dump-specialized.h |4 +-
 src/viewer/dump/dump.c |   80 ++--
 3 files changed, 80 insertions(+), 6 deletions(-)

diff --git a/NEWS b/NEWS
index 166539a..e135750 100644
--- a/NEWS
+++ b/NEWS
@@ -88,6 +88,8 @@ includes the changes listed under ``ELinks 0.11.6.GIT now'' 
below.
 * minor bug 1017: To work around HTTP server bugs, disable
   protocol.http.compression by default, until ELinks can report
   decompression errors or automatically retry the connection.
+* enhancement: ``elinks --dump'' uses box-drawing characters if supported
+  by the charset.
 
 Bugs that should be removed from NEWS before the 0.12.0 release:
 
diff --git a/src/viewer/dump/dump-specialized.h 
b/src/viewer/dump/dump-specialized.h
index 1afbea7..b8166f7 100644
--- a/src/viewer/dump/dump-specialized.h
+++ b/src/viewer/dump/dump-specialized.h
@@ -126,8 +126,8 @@ DUMP_FUNCTION_SPECIALIZED(struct document *document, struct 
dump_output *out)
 #endif /* DUMP_COLOR_MODE_TRUE */
 
if ((attr & SCREEN_ATTR_FRAME)
-    && c >= 176 && c < 224)
-   c = frame_dumb[c - 176];
+    && c >= FRAME_CHARS_BEGIN && c < FRAME_CHARS_END)
+   c = out->frame[c - FRAME_CHARS_BEGIN];
 
 #ifdef DUMP_CHARSET_UTF8
if (!isscreensafe_ucs(c)) c = ' ';
diff --git a/src/viewer/dump/dump.c b/src/viewer/dump/dump.c
index 295a7aa..81d0382 100644
--- a/src/viewer/dump/dump.c
+++ b/src/viewer/dump/dump.c
@@ -53,6 +53,9 @@ static int dump_redir_count = 0;
 
 #define D_BUF  65536
 
+#define FRAME_CHARS_BEGIN 0xB0
+#define FRAME_CHARS_END   0xE0
+
 /** A place where dumping functions write their output.  The data
  * first goes to the buffer in this structure.  When the buffer is
  * full enough, it is flushed to a file descriptor or to a string.  */
@@ -68,10 +71,76 @@ struct dump_output {
 * flushed, or -1.  */
int fd;
 
+   /** Mapping of SCREEN_ATTR_FRAME characters.  If the target
+* codepage is UTF-8 (which is possible only if CONFIG_UTF8 is
+* defined), then the values are UTF-32.  Otherwise, they are
+* in the target codepage, even though the type may still be
+* unicode_val_T.  */
+#ifdef CONFIG_UTF8
+   unicode_val_T frame[FRAME_CHARS_END - FRAME_CHARS_BEGIN];
+#else
+   unsigned char frame[FRAME_CHARS_END - FRAME_CHARS_BEGIN];
+#endif
+
/** Bytes waiting to be flushed.  */
unsigned char buf[D_BUF];
 };
 
+/** Mapping from CP437 box-drawing characters to simpler CP437 characters.
+ * - Map mixed light/double lines to light lines or double lines,
+ *   depending on the majority; or to light lines if even.
+ * - Map double lines to light lines.
+ * - Map light and dark shades to medium, then to full blocks.
+ * - Map half blocks to full blocks.
+ * - Otherwise map to ASCII characters.  */
+static const unsigned char frame_simplify[FRAME_CHARS_END - FRAME_CHARS_BEGIN]
+= {
+   /*-0-1-2-3-4-5-6-7 */
+   /*-8-9-A-B-C-D-E-F */
+   0xB1, 0xDB, 0xB1, '|' , '+' , 0xB4, 0xB9, 0xBF, /* 0xB0...0xB7 */
+   0xC5, 0xB4, 0xB3, 0xBF, 0xD9, 0xD9, 0xD9, '+' , /* 0xB8...0xBF */
+   '+' , '+' , '+' , '+' , '-' , '+' , 0xC3, 0xCC, /* 0xC0...0xC7 */
+   0xC0, 0xDA, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xCA, /* 0xC8...0xCF */
+   0xC1, 0xCB, 0xC2, 0xC0, 0xC0, 0xDA, 0xDA, 0xC5, /* 0xD0...0xD7 */
+   0xC5, '+' , '+' , '#' , 0xDB, 0xDB, 0xDB, 0xDB  /* 0xD8...0xDF */
+};
+
+static void
+dump_output_prepare_frame(struct dump_output *out, int to_cp)
+{
+   const int cp437 = get_cp_index("cp437");
+   int orig;
+   unsigned char subst;
+
+#ifdef CONFIG_UTF8
+   if (is_cp_utf8(to_cp)) {
+   for (orig = FRAME_CHARS_BEGIN; orig < FRAME_CHARS_END; orig++)
+   out->frame[orig - FRAME_CHARS_BEGIN]
+   = cp2u(cp437, orig);
+   return;
+   }
+#endif /* CONFIG_UTF8 */
+
+   for (orig = FRAME_CHARS_BEGIN; orig < FRAME_CHARS_END; orig++) {
+   for (subst = orig;
+   

Re: [elinks-dev] Line-drawing characters when dumping Web pages

2009-06-08 Thread Kalle Olavi Niemitalo
(moved from elinks-users to elinks-dev because of the patch)

Karl Ove Hufthammer k...@huftis.org writes:

> When I use ELinks interactively, table borders are drawn using nice 
> line-drawing characters. However, when I use ‘links --dump’, these are 
> replaced by ugly -, | and + ASCII characters, even if I dump to UTF-8. 
> Is there a way to retain the nice borders when dumping a Web page?

Not at the moment.  The attached patch for elinks-0.12
(20dfdb284f9a23742800fb5b4023bef54c6ad982) implements this, but
I'm not sure it is the right solution, because e.g. KOI8-R also
supports line-drawing characters so the fix should preferably
not be specific to UTF-8.  Comments?

From 827a77a6e5fad1f4dc69909090bf07fb7b84ee51 Mon Sep 17 00:00:00 2001
From: Kalle Olavi Niemitalo k...@iki.fi
Date: Tue, 9 Jun 2009 01:48:42 +0300
Subject: [PATCH] Line-drawing characters in UTF-8 dumps

When dumping the document to a file, ELinks used to represent lines in
tables and HR elements as ASCII -+| characters.  Now, if the output
charset is UTF-8, it uses Unicode line-drawing characters instead.
This change affects elinks --dump and File -> Save formatted document,
but not the Lua current_document_formatted function.
---
 NEWS   |2 +
 src/terminal/screen.c  |2 +-
 src/terminal/terminal.h|1 +
 src/viewer/dump/dump-specialized.h |   39 +++
 4 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/NEWS b/NEWS
index a84f3f9..f407c7c 100644
--- a/NEWS
+++ b/NEWS
@@ -15,6 +15,8 @@ includes the changes listed under ``ELinks 0.11.6.GIT now'' 
below.
 * minor bug 1017: To work around HTTP server bugs, disable
   protocol.http.compression by default, until ELinks can report
   decompression errors or automatically retry the connection.
+* enhancement: ``--dump'' and ``Save formatted document'' output
+  line-drawing characters if using UTF-8.
 
 ELinks 0.12pre4:
 
diff --git a/src/terminal/screen.c b/src/terminal/screen.c
index 8f838a6..34c93d8 100644
--- a/src/terminal/screen.c
+++ b/src/terminal/screen.c
@@ -41,7 +41,7 @@ static const unsigned char frame_vt100[48] =  
aaaxuuukkuxkjjjkmvwtqnttmlvwtqnvvw
  * characters encoded in CP437.
  * When UTF-8 I/O is enabled, ELinks uses this array instead of
  * ::frame_vt100[], and converts the characters from CP437 to UTF-8.  */
-static const unsigned char frame_vt100_u[48] = {
+const unsigned char frame_vt100_u[48] = {
177, 177, 177, 179, 180, 180, 180, 191,
191, 180, 179, 191, 217, 217, 217, 191,
192, 193, 194, 195, 196, 197, 195, 195,
diff --git a/src/terminal/terminal.h b/src/terminal/terminal.h
index c2c1d79..3bf9d19 100644
--- a/src/terminal/terminal.h
+++ b/src/terminal/terminal.h
@@ -166,6 +166,7 @@ extern LIST_OF(struct terminal) terminals;
 
 
 extern const unsigned char frame_dumb[];
+extern const unsigned char frame_vt100_u[];
 
 struct terminal *init_term(int, int);
 void destroy_terminal(struct terminal *);
diff --git a/src/viewer/dump/dump-specialized.h 
b/src/viewer/dump/dump-specialized.h
index f60aeed..6d21839 100644
--- a/src/viewer/dump/dump-specialized.h
+++ b/src/viewer/dump/dump-specialized.h
@@ -41,6 +41,9 @@ DUMP_FUNCTION_SPECIALIZED(struct document *document, int fd,
unsigned char *background = color[3];
int width = get_opt_int(document.dump.width);
 #endif /* DUMP_COLOR_MODE_TRUE */
+#ifdef DUMP_CHARSET_UTF8
+   const int cp437 = get_cp_index("cp437");
+#endif
 
for (y = 0; y < document->height; y++) {
int white = 0;
@@ -105,23 +108,11 @@ DUMP_FUNCTION_SPECIALIZED(struct document *document, int 
fd,
 
c = document->data[y].chars[x].data;
 
+#ifndef DUMP_CHARSET_UTF8
if ((attr & SCREEN_ATTR_FRAME)
 && c >= 176 && c < 224)
c = frame_dumb[c - 176];
-#ifdef DUMP_CHARSET_UTF8
-   else {
-   unsigned char *utf8_buf = encode_utf8(c);
-
-   while (*utf8_buf) {
-   if (write_char(*utf8_buf++,
-   fd, buf, bptr)) return -1;
-   }
-
-   x += unicode_to_cell(c) - 1;
-
-   continue;
-   }
-#endif /* DUMP_CHARSET_UTF8 */
+#endif /* !DUMP_CHARSET_UTF8 */
 
if (c <= ' ') {
/* Count spaces. */
@@ -136,10 +127,30 @@ DUMP_FUNCTION_SPECIALIZED(struct document *document, int 
fd,
white--;
}
 
+#ifdef DUMP_CHARSET_UTF8
+   if ((attr & SCREEN_ATTR_FRAME)
+    && c >= 176 && c < 224)
+   c = cp2u(cp437, frame_vt100_u[c - 176]);
+
+   {
+