Here's a quickie that someone else might like to verify if they've
run into the same problem. When htdig encounters an entity that it
doesn't know about (say &#146; - which should really be &#8217; but
that's another issue) it copies it verbatim to the extract - so far
so good. When the extract is sent out in Display::hilight, the
extract is decoded with HtSGMLCodec to transform the unsigned char
characters to entities, and as well as the characters above 160 it
translates & to &amp;, which is fine except when & is the start of
an entity. This is what leaves things like &amp;#146; in extracts.
Here's a patch to HtSGMLCodec::decode to make sure that it doesn't break
real entities.
======================================
diff -rup htdig/htcommon/HtSGMLCodec.cc
htdig-patch2/htcommon/HtSGMLCodec.cc
--- htdig/htcommon/HtSGMLCodec.cc Fri Oct 20 16:40:55 2000
+++ htdig-patch2/htcommon/HtSGMLCodec.cc Tue Oct 16 15:37:05 2001
@@ -19,6 +19,8 @@
#include "HtSGMLCodec.h"
+#include <ctype.h>
+
// Constructor: parses the appropriate parameters using the
// encapsulated HtWordCodec class.
// Only used in privacy.
@@ -106,5 +108,92 @@ HtSGMLCodec::instance()
return _instance;
}
+
+
+// ***********************************************
+int
+HtSGMLCodec::IsEntity( const String &entity ) const
+{
+  // Returns 1 if `entity` looks like a complete entity reference,
+  // 0 otherwise.  Accepted forms:
+  //   &name;   alphanumerics only, no spaces      (min length 3)
+  //   &#nnn;   decimal digits only                (min length 4)
+  //   &#xhh;   hexadecimal digits only (x or X)   (min length 5)
+  // I'm not supporting entities that don't end with a semi-colon.
+
+  const int len = entity.length();
+
+  // Any one of these failures disqualifies the string, so the tests
+  // are OR-ed.  Checking the length first also keeps the indexing
+  // below in bounds for empty or very short input.
+  if ( len < 3 || entity[0] != '&' || entity[len-1] != ';' )
+    return 0;
+
+  int is_decimal = 0;
+  int is_hex = 0;
+  int start = 1;        // index of the first character to validate
+
+  if ( entity[1] == '#' )
+  {
+    if ( len > 3 && ( entity[2] == 'x' || entity[2] == 'X' ) ) {
+      is_hex = 1;
+      start = 3;
+      if ( len < 5 )
+        return 0;
+    } else {
+      is_decimal = 1;
+      start = 2;
+      if ( len < 4 )
+        return 0;
+    }
+  }
+
+  // Validate every character between the prefix and the final ';'.
+  for ( int i = start; i < len-1; i++ )
+  {
+    // The <ctype.h> classifiers need a value representable as
+    // unsigned char; a plain char may be negative for high-bit bytes.
+    const int ch = static_cast<unsigned char>( entity[i] );
+
+    if ( !isalnum( ch ) )
+      return 0;
+
+    if ( is_decimal && !isdigit( ch ) )
+      return 0;
+    if ( is_hex && !isxdigit( ch ) )
+      return 0;
+  }
+  return 1;
+}
+
+
+// ***********************************************
+String HtSGMLCodec::decode(const String &coded) const
+{
+  // Runs `coded` through myTextWordCodec->decode(), except that any
+  // span IsEntity() accepts is copied to the result unchanged, so the
+  // '&' that starts an existing entity is not itself converted.
+  String out;
+  int last_pos = 0;     // start of the text not yet appended to `out`
+
+  while ( last_pos <= coded.length() )
+  {
+    const int amp_pos = coded.indexOf( '&', last_pos );
+    const int semi_pos =
+      ( amp_pos != -1 ) ? coded.indexOf( ';', amp_pos+1 ) : -1;
+
+    if ( amp_pos == -1 || semi_pos == -1 ) // no more possible entities
+    {
+      out << myTextWordCodec->decode( coded.sub( last_pos ) );
+      break;
+    }
+
+    // Candidate entity: from the '&' through the ';' inclusive.
+    const int cand_len = semi_pos + 1 - amp_pos;
+
+    if ( IsEntity( coded.sub( amp_pos, cand_len ) ) )
+    {
+      // Convert the text before the entity, then pass the entity
+      // itself through verbatim.
+      out << myTextWordCodec->decode(
+               coded.sub( last_pos, amp_pos - last_pos ) );
+      out << coded.sub( amp_pos, cand_len );
+      last_pos = amp_pos + cand_len;
+    }
+    else
+    {
+      // Not an entity.  Consume only up to and including this '&'
+      // and rescan from the next character: a genuine entity may
+      // start before the ';' that was found (e.g. "a & b &#146;"),
+      // and it must still be recognised.  The length is measured
+      // from last_pos so no pending text is dropped.
+      // NOTE(review): assumes the codec converts each piece
+      // independently, as the piecewise calls above already do.
+      out << myTextWordCodec->decode(
+               coded.sub( last_pos, amp_pos + 1 - last_pos ) );
+      last_pos = amp_pos + 1;
+    }
+  }
+
+  return out;
+}
+
// End of HtSGMLCodec.cc
diff -rup htdig/htcommon/HtSGMLCodec.h htdig-patch2/htcommon/HtSGMLCodec.h
--- htdig/htcommon/HtSGMLCodec.h Fri Oct 20 16:40:55 2000
+++ htdig-patch2/htcommon/HtSGMLCodec.h Tue Oct 16 15:33:08 2001
@@ -33,8 +33,8 @@ public:
{ return myTextWordCodec->encode(myNumWordCodec->encode(uncoded)); }
// But we only want to decode into one form i.e. &foo; NOT &#nnn;
- String decode(const String &coded) const
- { return myTextWordCodec->decode(coded); }
+ // but we don't want to decode & if it's part of an entity.
+ String decode(const String &coded) const;
// If an error was discovered during the parsing of
// entities, this returns an error message
@@ -54,6 +54,9 @@ private:
HtSGMLCodec();
HtSGMLCodec(const HtSGMLCodec &);
void operator= (const HtSGMLCodec &);
+
+ //! returns true if the parameter is an entity.
+ int IsEntity( const String &entity ) const;
HtWordCodec *myTextWordCodec; // For &foo;
HtWordCodec *myNumWordCodec; // For &#foo;
======================================
Jamie Anstice
Search Engineer
S.L.I. Systems
[EMAIL PROTECTED]
ph: 64 961 3262
mobile: 64 21 264 9347
_______________________________________________
htdig-general mailing list <[EMAIL PROTECTED]>
To unsubscribe, send a message to <[EMAIL PROTECTED]> with a
subject of unsubscribe
FAQ: http://htdig.sourceforge.net/FAQ.html