At 16:17 04.01.00 -0600, you wrote:
>At 12:08 AM +0100 1/4/00, Marc Pohl wrote:
>>Hi,
>>
>>For the last few weeks I wondered why htdig doesn't like any words with the
>>German U umlaut (char 252) on my Solaris server. All locale settings
>>were correct, and the same configuration runs on a Linux box without
>>any problems.
>>
>>Today I discovered that the reason is that
>>WordList::valid_word() is not 8-bit-clean on Sun Solaris 2.6!
>>(iscntrl(252) gets 1, but iscntrl((unsigned char)252) is 0)
>
>Yes, you are correct, this is a bug.
>Thanks a bunch for your patch!
>
>-Geoff
>

Hello Geoff,

I reviewed the source code for htdig-3.2.0b1-dev-010900 this weekend and discovered
that there could be similar errors in htword/WordType.cc because of signed char to int
casts. Exactly the same error cannot happen, because in 3.2 the iscntrl() call is in the
else branch of the IsStrictChar() test.

My proposed patch is the following snippet, which introduces two new member functions in
WordType that are used instead of calling isdigit() and iscntrl() directly.

*** WordType.h.orig     Sun Jan  9 14:16:21 2000
--- WordType.h  Sun Jan  9 14:52:18 2000
***************
*** 69,74 ****
--- 69,76 ----
    // 
    int IsChar(int c) const;
    int IsStrictChar(int c) const;
+   int IsDigit(int c) const;
+   int IsControl(int c) const;
  
    //
    // Transformations
***************
*** 99,104 ****
--- 101,107 ----
  #define WORD_TYPE_DIGIT       0x02
  #define WORD_TYPE_EXTRA       0x04
  #define WORD_TYPE_VALIDPUNCT  0x08
+ #define WORD_TYPE_CONTROL     0x10
  
  // One for characters that when put together are a word
  // (including punctuation).
***************
*** 113,118 ****
--- 116,135 ----
  WordType::IsStrictChar(int c) const
  {
    return (chrtypes[(unsigned char)c] & 
(WORD_TYPE_ALPHA|WORD_TYPE_DIGIT|WORD_TYPE_EXTRA)) != 0;
+ }
+ 
+ // Reimplementation of isdigit() using the lookup table chrtypes[] 
+ inline int
+ WordType::IsDigit(int c) const
+ {
+   return (chrtypes[(unsigned char)c] & WORD_TYPE_DIGIT) != 0;
+ }
+ 
+ // Similar to IsDigit, but for iscntrl()
+ inline int
+ WordType::IsControl(int c) const
+ {
+   return (chrtypes[(unsigned char)c] & WORD_TYPE_CONTROL) != 0;
  }
  
  // Let caller get rid of getting and holding a configuration parameter.


*** WordType.cc.orig    Sun Jan  9 14:16:26 2000
--- WordType.cc Sun Jan  9 15:28:09 2000
***************
*** 64,69 ****
--- 64,71 ----
        chrtypes[i] |= WORD_TYPE_ALPHA;
      if (isdigit(i))
        chrtypes[i] |= WORD_TYPE_DIGIT;
+     if (iscntrl(i))
+       chrtypes[i] |= WORD_TYPE_CONTROL;
      if (strchr(extra_word_chars, i))
        chrtypes[i] |= WORD_TYPE_EXTRA;
      if (strchr(valid_punct, i))
***************
*** 148,157 ****
    // Reject if contains control characters
    //
    int alpha = 0;
!   for(const char *p = (char*)word; *p; p++) {
!     if(IsStrictChar((unsigned char)*p) || (allow_numbers && isdigit(*p))) {
        alpha = 1;
!     } else if(iscntrl(*p)) {
        return status | WORD_NORMALIZE_CONTROL;
      }
    }
--- 150,159 ----
    // Reject if contains control characters
    //
    int alpha = 0;
!   for(const unsigned char *p = (const unsigned char*)(const char*)word; *p; p++) {
!     if(IsStrictChar(*p) || (allow_numbers && IsDigit(*p))) {
        alpha = 1;
!     } else if(IsControl(*p)) {
        return status | WORD_NORMALIZE_CONTROL;
      }
    }



Marc



-----------------------------------------------------------------------
Marc Pohl, Online-Service-Center, Westdeutscher Rundfunk, D-50600 Koeln
[EMAIL PROTECTED], +49 221 220 8618,  http://www.wdr.de/
-----------------------------------------------------------------------


------------------------------------
To unsubscribe from the htdig3-dev mailing list, send a message to
[EMAIL PROTECTED] 
You will receive a message to confirm this. 

Reply via email to