Neil Hodgson wrote:
Robert Roessler:
BTW, I had rounded up a replacement hashing function some time ago -
it gives better distribution than the stock one used by Scintilla,
which of course translates to fewer items sharing buckets. If someone
is interested...
You are probably thinking of PropSet which should probably be
turned into a real hash table as the data sets it is used on are much
larger than it was originally written for. For keywords its
WordList::InList which after the first character does a linear search.
SinkWorld uses a trie and maps the word to an integer so its easy to
have several lists of identifiers with different styles.
Well, I have "adjusted" PropSet to use the well-regarded and much
discussed "SuperFastHash" by Paul Hsieh at
http://www.azillionmonkeys.com/qed/hash.html
I have included a copy of propset.h as an attachment (it's only 4.5 KB).
Comments on this proposed update:
1) As already mentioned, the code is quite well picked over and discussed
2) I packaged it in a very contained way - just replacing the previous
[inline] definition of PropSet::HashString in propset.h
3) The legalities are discussed in some length by the author at
http://www.azillionmonkeys.com/qed/weblicense.html
By stripping the already quite sparse comments, I believe the borrowed
"raw source code" qualifies under the author's "derivative" license,
and Scintilla's own licensing terms do not invalidate this.
4) While the algorithm is discussed [again] in great length in the
first link above, the gist of it is to do the best job possible to
evenly distribute the effects of each byte in the input string across
ALL of the output bits (sort of the "Mom and apple pie" of hashing).
As the whole point is to cleanly spread across your buckets to reduce
collisions and subsequent linear searching behavior in your lookups, a
little more time is [well] spent in the hash itself.
Usage-wise, given the [even] distribution across all of the bits in
the hash value, one only needs to grab any subset of these; any
operation which effectively takes the low-order bits is adequate. As
for Scintilla/SciTE, this means making PropSet::hashRoots a power of
two, so that the later "% hashRoots" that is always used on the result
of hashing will use all of the buckets. Put differently, because of
the characteristics of this hash's distribution, a prime number modulo
-based approach is not required (or suggested).
Properly balancing size and performance is the trick here - since
PropSet gets used in a one-size-fits-all way (with large sets but with
space still mattering), I have used 256 buckets as a compromise.
Going larger just gets better (my own heavily instrumented use of this
algorithm was with 4096 buckets), so upping this might make sense.
This is really up to Neil, who *may* want to make it a configurable
option, so that say, SciTE could use a larger number like 4096, while
more constrained uses of Scintilla would use 256 (or even 32).
Robert Roessler
[EMAIL PROTECTED]
http://www.rftp.com
// Scintilla source code edit control
/** @file PropSet.h
** A Java style properties file module.
**/
// Copyright 1998-2002 by Neil Hodgson <[EMAIL PROTECTED]>
// The License.txt file describes the conditions under which this software may
be distributed.
#ifndef PROPSET_H
#define PROPSET_H
#include "SString.h"
bool EqualCaseInsensitive(const char *a, const char *b);
bool isprefix(const char *target, const char *prefix);
struct Property {
unsigned int hash;
char *key;
char *val;
Property *next;
Property() : hash(0), key(0), val(0), next(0) {}
};
/**
*/
class PropSet {
protected:
enum { hashRoots=256 };
Property *props[hashRoots];
Property *enumnext;
int enumhash;
static bool caseSensitiveFilenames;
// use Paul Hsieh's "SuperFastHash"
// (from http://www.azillionmonkeys.com/qed/hash.html)
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned long uint32_t;
#undef get16bits
#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \
|| defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__)
#define get16bits(d) (*((const uint16_t *) (d)))
#endif
#if !defined (get16bits)
#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\
+(uint32_t)(((const uint8_t
*)(d))[0]))
#endif
static unsigned int HashString(const char *data, size_t len) {
uint32_t hash = len;
if (len <= 0 || data == NULL) return 0;
const int rem = len & 3;
len >>= 2;
for (;len > 0; len--) {
hash += get16bits (data);
const uint32_t tmp = (get16bits (data+2) << 11) ^ hash;
hash = (hash << 16) ^ tmp;
data += 2*sizeof (uint16_t);
hash += hash >> 11;
}
switch (rem) {
case 3: hash += get16bits (data);
hash ^= hash << 16;
hash ^= data[sizeof (uint16_t)] << 18;
hash += hash >> 11;
break;
case 2: hash += get16bits (data);
hash ^= hash << 11;
hash += hash >> 17;
break;
case 1: hash += *data;
hash ^= hash << 10;
hash += hash >> 1;
break;
}
hash ^= hash << 3;
hash += hash >> 5;
hash ^= hash << 4;
hash += hash >> 17;
hash ^= hash << 25;
hash += hash >> 6;
return hash;
}
static bool IncludesVar(const char *value, const char *key);
public:
PropSet *superPS;
PropSet();
~PropSet();
void Set(const char *key, const char *val, int lenKey=-1, int
lenVal=-1);
void Set(const char *keyVal);
void Unset(const char *key, int lenKey=-1);
void SetMultiple(const char *s);
SString Get(const char *key);
SString GetExpanded(const char *key);
SString Expand(const char *withVars, int maxExpands=100);
int GetInt(const char *key, int defaultValue=0);
SString GetWild(const char *keybase, const char *filename);
SString GetNewExpand(const char *keybase, const char *filename="");
void Clear();
char *ToString(); // Caller must delete[] the return value
bool GetFirst(char **key, char **val);
bool GetNext(char **key, char **val);
static void SetCaseSensitiveFilenames(bool caseSensitiveFilenames_) {
caseSensitiveFilenames = caseSensitiveFilenames_;
}
private:
// copy-value semantics not implemented
PropSet(const PropSet ©);
void operator=(const PropSet &assign);
};
/**
*/
class WordList {
public:
// Each word contains at least one character - a empty word acts as
sentinel at the end.
char **words;
char **wordsNoCase;
char *list;
int len;
bool onlyLineEnds; ///< Delimited by any white space or only line
ends
bool sorted;
bool sortedNoCase;
int starts[256];
WordList(bool onlyLineEnds_ = false) :
words(0), wordsNoCase(0), list(0), len(0),
onlyLineEnds(onlyLineEnds_),
sorted(false), sortedNoCase(false) {}
~WordList() { Clear(); }
operator bool() { return len ? true : false; }
char *operator[](int ind) { return words[ind]; }
void Clear();
void Set(const char *s);
char *Allocate(int size);
void SetFromAllocated();
bool InList(const char *s);
bool InListAbbreviated(const char *s, const char marker);
const char *GetNearestWord(const char *wordStart, int searchLen,
bool ignoreCase = false, SString wordCharacters="", int
wordIndex = -1);
char *GetNearestWords(const char *wordStart, int searchLen,
bool ignoreCase=false, char otherSeparator='\0', bool
exactLen=false);
};
inline bool IsAlphabetic(unsigned int ch) {
return ((ch >= 'A') && (ch <= 'Z')) || ((ch >= 'a') && (ch <= 'z'));
}
#ifdef _MSC_VER
// Visual C++ doesn't like the private copy idiom for disabling
// the default copy constructor and operator=, but it's fine.
#pragma warning(disable: 4511 4512)
#endif
#endif
_______________________________________________
Scite-interest mailing list
[email protected]
http://mailman.lyra.org/mailman/listinfo/scite-interest