[aspell-devel] aspell 0.60 prezip.c & compress.c improvements

Jose Da Silva Fri, 17 Sep 2004 17:25:24 -0700

Hi,
Please accept these additional fixes.
Explanations for each, below.
 thanks.


---1--prezip.c---
Speed improvement:
Not necessary to test  w[l] != '\0' if already tested p[l] != '\0' because 
the next test is for p[l] == w[l]

---2--prezip.c---
fflush needed to flush out remaining binary since there is no trailing '\n'

---3--prezip.c---
Autosense & removal of trailing CR.
Currently, compress is immune to the carriage-return/line-feed differences 
between DOS-based text and Linux/Mac/other text lists, but prezip treats the 
carriage return as part of the word. If you mix lists, DOS-based versions 
of prezip are therefore going to create error-filled or dirty word lists.
The added code should take care of the differences when reading input from 
mixed DOS-based and non-DOS-based lists, while still being able to work on 
word lists that use an internal CR as a valid character.

test samples, one file with CR and one with no CR, both produce 60byte files:
prezip -z <q_cr.txt >q1.pwl
prezip -z <q_no_cr.txt >q2.pwl

diff q1.pwl q2.pwl   = no differences = what you want  :-)

prezip -d <q1.pwl >q.txt    = 84 bytes for linux-based aspell
prezip -d <q1.pwl >q.txt    = 91 bytes for DOS-based aspell


---1--compress.c---
compress.c needs additional fix:
        #define BUFSIZE 256

should become something like:
        /* BUFSIZE must be 256  to work correctly */
        #define BUFSIZE 256

...so that future modifications do not change BUFSIZE to anything other than 
256, since that would introduce errors:
(1) a value higher than 256 breaks compression of words longer than 255 
characters, since the length is encoded as a single char (0...255);
(2) a value lower than 256 breaks decompression of long words; for example, 
BUFSIZE = 10 fails on a word that is 20 characters long.

---other, misc.---
Canadian English spelling missing "blonde"

000002:0800
000003:0809
000004:0809
000800:0000
000801:0000
000802:0009
000803:0009

000002:0800
000003:0809
000004:0809
000800:0000
000801:0000
000802:0009
000803:0009

--- aspell-0.60/prog/prezip.c   2004-08-22 15:32:57.000000000 -0700
+++ prezip.c    2004-09-17 17:13:06.502872152 -0700
@@ -82,6 +82,8 @@
     char * w = 0;
     char * p = 0;
     int c,l;
+    char CRtest1stLine = 0;    /* Test 1st line of word-list for CR */
+    char CRremoveAll = 0;      /* Remove All CR if 1st line has CR  */
 
     w1.str = (char *)malloc(256);
     w1.alloc = 256;
@@ -108,13 +110,31 @@
         }
       }
 
+      /* Remove trailing Carriage Returns from input word-list */
+      if (c != EOF) {
+       if (CRtest1stLine) {
+         /* Expect all following lines to include/exclude CR */
+         if (CRremoveAll && *(w - 1) == 45 && *(w - 2) == 31) {
+           --w; --w;
+         }
+       } else {
+         /* Autosense 1st line for Carriage Return */
+         if (*(w - 1) == 45 && *(w - 2) == 31) {
+           /* 1st line has CR, so all lines must have CR */
+           CRremoveAll = 1;
+           --w; --w;
+         }
+         CRtest1stLine = 1;
+       }
+      }
+
       *w = 0;
       p = prev->str;
       w = cur->str;
 
       /* get the length of the prefix */
       l = 0;
-      while (p[l] != '\0' && w[l] != '\0' && p[l] == w[l]) ++l;
+      while (p[l] != '\0' && p[l] == w[l]) ++l;
 
       /* prefix compress, and write word */
       if (l < 30) {
@@ -137,6 +157,7 @@
 
     putc(31, stdout);
     putc(255, stdout);
+    fflush(stdout);
 
     free(w1.str);
     free(w2.str);
@@ -233,7 +254,7 @@
 
     return ret;
 
-  } else if (strcmp(argv[1], "-V") == 0) {
+  } else if (strcmp(argv[1], "-v") == 0) {
 
     printf("%s\n", HEAD);

/*
 * Copyright (c) 2004
 * Kevin Atkinson
 *
 * Permission to use, copy, modify, distribute and sell this software
 * and its documentation for any purpose is hereby granted without
 * fee, provided that the above copyright notice appear in all copies
 * and that both that copyright notice and this permission notice
 * appear in supporting documentation.  Kevin Atkinson makes no
 * representations about the suitability of this software for any
 * purpose.  It is provided "as is" without express or implied
 * warranty.
 *
 */

/*
 * Format:
 *   <data> ::= 0x02 <line>+ 0x1F 0xFF
 *   <line> ::= <prefix> <rest>*
 *   <prefix> ::= 0x00..0x1D | 0x1E 0xFF* 0x00..0xFE
 *   <rest> ::= 0x20..0xFF | <escape>
 *   <escape> ::= 0x1F 0x20..0x3F
 *
 * To decompress:
 *   Take the first PREFIX_LEN characters from the previous line
 *   and concatenate that with the rest, unescaping as necessary.
 *   The PREFIX_LEN is the sum of the characters in <prefix>.
 *   To unescape take the second character of <escape> and subtract 0x20.
 *   Note that the prefix length is computed before unescaping characters.
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>

#if defined(__CYGWIN__) || defined (_WIN32)

#  include <io.h>
#  include <fcntl.h>

#  define SETBIN(fno)  _setmode( _fileno( fno ), _O_BINARY )

#else

#  define SETBIN(fno)

#endif

#define HEAD "prezip, a prefix delta compressor. Version 0.1, 2004-04-23"

/* A heap-allocated character buffer together with its allocated size. */
struct Word {
  char * str;     /* start of the buffer */
  size_t alloc;   /* number of bytes currently allocated for str */
};
typedef struct Word Word;

/*
 * INSURE_SPACE(cur, p, need)
 *
 * Make sure the buffer of Word *cur has room for `need` more bytes at the
 * write position `p`, plus a terminating NUL, growing it if necessary.
 *
 *   cur   pointer to the Word that owns the buffer
 *   p     write cursor inside cur->str; it is re-based onto the new
 *         buffer when the buffer moves, so it must be a modifiable lvalue
 *   need  number of bytes about to be written at p
 *
 * The capacity is grown by roughly 3/2 repeatedly until it is sufficient,
 * so a single call is enough for any value of `need`.  The realloc result
 * is checked before the old pointer is overwritten; if the allocation
 * fails, a message is written to stderr and the program exits with
 * EXIT_FAILURE.
 *
 * Needs <stdio.h> and <stdlib.h> at the point of expansion.
 */
#define INSURE_SPACE(cur,p,need)\
  do {\
    size_t pos_ = (size_t)((p) - (cur)->str);\
    size_t want_ = pos_ + (size_t)(need) + 1;\
    size_t cap_ = (cur)->alloc;\
    char * grown_;\
    if (want_ < cap_) break;\
    while (cap_ <= want_) cap_ += cap_ / 2 + 1;\
    grown_ = (char *)realloc((cur)->str, cap_);\
    if (grown_ == NULL) {\
      fputs("out of memory\n", stderr);\
      exit(EXIT_FAILURE);\
    }\
    (cur)->str = grown_;\
    (cur)->alloc = cap_;\
    (p) = grown_ + pos_;\
  } while (0)

/*
 * ADV(w, c)
 *
 * Advance the char pointer `w` by `c` characters of the NUL-terminated
 * string it points into.  If the terminator is reached before `c`
 * characters have been skipped, the variable `ret` (which must be in scope
 * at the point of expansion) is set to 3 and `w` is left pointing at the
 * terminator, so the cursor never moves past the end of the string.
 * A zero or negative count (for example EOF returned by getc) leaves `w`
 * and `ret` untouched.
 */
#define ADV(w, c) do {int left_ = (c);\
                      while (left_ > 0) {\
                        if (*(w) == 0) {ret = 3; break;}\
                        ++(w); --left_;}} while (0)

/*
 * prezip command-line entry point.
 *
 * Modes (selected by argv[1]):
 *   -z        read a newline-separated word list from stdin and write the
 *             prefix-compressed form (leading byte 0x02) to stdout.
 *   -d [msg]  read compressed data from stdin and write the word list to
 *             stdout.  If decoding fails and argv[2] is given, it is
 *             written to stderr in front of the error message.
 *   -v, -V    print the version banner.
 *
 * Returns 0 on success, 1 after printing the usage text, EXIT_FAILURE when
 * the initial buffers cannot be allocated, and in -d mode 2 (unknown
 * format), 3 (corrupt input) or 4 (unexpected EOF).
 */
int main (int argc, const char *argv[]) {

  if (argc < 2) {

    goto usage;

  } else if (strcmp(argv[1], "-z") == 0) {

    Word w1,w2;
    Word * prev = &w1;
    Word * cur  = &w2;
    char * w = 0;
    char * p = 0;
    int c,l;
    char CRtest1stLine = 0;     /* set once the 1st line has been examined */
    char CRremoveAll = 0;       /* strip trailing CR if the 1st line had one */

    w1.str = (char *)malloc(256);
    w1.alloc = 256;
    w2.str = (char *)malloc(256);
    w2.alloc = 256;
    if (w1.str == NULL || w2.str == NULL) {
      fputs("out of memory\n", stderr);
      free(w1.str);
      free(w2.str);
      return EXIT_FAILURE;
    }

    /* The "previous word" of the first line is the empty string; without
     * this the first prefix comparison would read uninitialized memory. */
    w1.str[0] = '\0';
    w2.str[0] = '\0';

    SETBIN (stdout);

    putc(2, stdout);

    c = 0;
    while (c != EOF)
    {
      /* get next word; bytes below 32 are stored escaped as 31, byte+32 */
      w = cur->str;
      while (c = getc(stdin), c != EOF && c != '\n') {
        if (c >= 32) {
          INSURE_SPACE(cur, w, 1);
          *w++ = c;
        } else {
          INSURE_SPACE(cur, w, 2);
          *w++ = 31;
          *w++ = c + 32;
        }
      }

      /* Remove trailing Carriage Returns from input word-list */
      if (c != EOF) {
        /* An escaped CR is the pair 31, 13+32 == 45.  Only look at it when
         * at least two bytes were stored, so that an empty or one-byte
         * line never reads in front of the buffer. */
        int has_cr = (w - cur->str >= 2)
                     && *(w - 1) == 45 && *(w - 2) == 31;

        if (!CRtest1stLine) {
          /* Autosense: if the 1st line has a CR, all lines are expected
           * to have one. */
          CRremoveAll = (char)has_cr;
          CRtest1stLine = 1;
        }
        if (CRremoveAll && has_cr) {
          w -= 2;
        }
      }

      *w = 0;
      p = prev->str;
      w = cur->str;

      /* get the length of the prefix shared with the previous word */
      l = 0;
      while (p[l] != '\0' && p[l] == w[l]) ++l;

      /* prefix compress, and write word: lengths below 30 take one byte,
       * longer ones are written as 30, any number of 255s, remainder */
      if (l < 30) {
        putc(l, stdout);
      } else {
        int i = l - 30;
        putc(30, stdout);
        while (i >= 255) {putc(255, stdout); i -= 255;}
        putc(i, stdout);
      }
      fputs(w+l, stdout);

      /* swap prev and next */
      {
        Word * tmp = cur;
        cur = prev;
        prev = tmp;
      }
    }

    /* end-of-data marker; flush because no trailing '\n' follows it */
    putc(31, stdout);
    putc(255, stdout);
    fflush(stdout);

    free(w1.str);
    free(w2.str);

  } else if (strcmp(argv[1], "-d") == 0) {

    /* ret: -1 while inside a stream, 0 once the end marker was seen,
     * >0 is an error code (see the messages at the end of this branch) */
    int ret = 0;

    Word cur;
    int c;
    char * w;
    unsigned char ch;
    size_t prev_len = 0;        /* length of the previous word (format 1) */

    cur.str = (char *)malloc(256);
    cur.alloc = 256;
    if (cur.str == NULL) {
      fputs("out of memory\n", stderr);
      return EXIT_FAILURE;
    }
    /* start from an empty "previous word" so a bogus first prefix is
     * detected instead of walking over uninitialized memory */
    cur.str[0] = '\0';
    w = cur.str;

    SETBIN (stdin);

    c = getc(stdin);

    if (c == 2)
    {
      while (c != EOF && ret <= 0) {
        ret = -1;
        if (c != 2) {ret = 3; break;}
        c = getc(stdin);
        while (ret < 0) {
          /* input ended where a prefix byte was expected */
          if (c == EOF) {ret = 4; break;}
          w = cur.str;
          ADV(w, c);
          if (c == 30) {
            while (c = getc(stdin), c == 255) ADV(w, 255);
            if (c == EOF) {ret = 4; break;}
            ADV(w, c);
          }
          /* bytes above 30 belong to the word; anything else is the
           * prefix byte of the next line (or EOF) */
          while (c = getc(stdin), c > 30) {
            INSURE_SPACE(&cur,w,1);
            *w++ = (char)c;
          }
          *w = '\0';
          for (w = cur.str; *w; w++) {
            if (*w != 31) {
              putc(*w, stdout);
            } else {
              ++w;
              ch = *w;
              if (32 <= ch && ch < 64) {
                putc(ch - 32, stdout);
              } else if (ch == 255) {
                /* 31 255 is the end marker and must end the word */
                if (w[1] != '\0') ret = 3;
                else              ret = 0;
              } else {
                ret = 3;
                /* dangling escape at the end of the word: stop here so
                 * the loop increment does not step past the terminator */
                if (ch == '\0') break;
              }
            }
          }
          if (ret < 0 && c == EOF) ret = 4;
          if (ret != 0)
            putc('\n', stdout);
        }
      }
    }
    else if (c == 1)
    {
      /* Older format: each word is introduced by a byte holding
       * prefix length + 1; a 0 byte means the value is in the next byte. */
      while (c != EOF) {
        if (c == 0) {
          c = getc(stdin);
          if (c == EOF) {ret = 4; break;}
        }
        --c;
        /* the prefix can never be longer than the previous word */
        if (c < 0 || (size_t)c > prev_len) {ret = 3; break;}
        w = cur.str + c;
        while (c = getc(stdin), c > 32) {
          INSURE_SPACE(&cur,w,1);
          *w++ = (char)c;
        }
        *w = '\0';
        prev_len = (size_t)(w - cur.str);
        fputs(cur.str, stdout);
        putc('\n', stdout);
      }
    }
    else
    {
      ret = 2;
    }

    assert(ret >= 0);
    if (ret > 0 && argc > 2)
      fputs(argv[2], stderr);
    if (ret == 2)
      fputs("unknown format\n", stderr);
    else if (ret == 3)
      fputs("corrupt input\n", stderr);
    else if (ret == 4)
      fputs("unexpected EOF\n", stderr);

    free (cur.str);

    return ret;

  } else if (strcmp(argv[1], "-v") == 0 || strcmp(argv[1], "-V") == 0) {

    /* both spellings are accepted: "-V" is what aspell 0.60 shipped with */
    printf("%s\n", HEAD);

  } else {

    goto usage;

  }

  return 0;

  usage:

  printf("%s\n"
         "Usage:\n"
         "  To Compress:   %s -z\n"
         "  To Decompress: %s -d\n", HEAD, argv[0], argv[0]);
  return 1;
}

_______________________________________________
Aspell-devel mailing list
[EMAIL PROTECTED]
http://lists.gnu.org/mailman/listinfo/aspell-devel

[aspell-devel] aspell 0.60 prezip.c & compress.c improvements

Reply via email to