Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)

Tom Lane Tue, 08 Jan 2019 12:05:14 -0800

John Naylor <[email protected]> writes:
> On Tue, Jan 8, 2019 at 12:06 PM Andrew Dunstan
> <[email protected]> wrote:
>> If he doesn't I will.


> I'll take a crack at separating into a module.  I'll wait a bit in
> case there are any stylistic suggestions on the patch as it stands.

I had a go at that myself.  I'm sure there's plenty to criticize in
the result, but at least it passes make check-world ;-)

I resolved the worry I had last night about the range of table values
by putting in logic to check the range and choose a suitable table
element type.  There are a couple of existing calls where we manage
to fit the hashtable elements into int8 that way; of course, by
definition that doesn't save a whole lot of space since such tables
couldn't have many elements, but it seems cleaner anyway.

                        regards, tom lane

diff --git a/src/common/Makefile b/src/common/Makefile
index 317b071..d0c2b97 100644
*** a/src/common/Makefile
--- b/src/common/Makefile
*************** OBJS_FRONTEND = $(OBJS_COMMON) fe_memuti
*** 63,68 ****
--- 63,73 ----
  OBJS_SHLIB = $(OBJS_FRONTEND:%.o=%_shlib.o)
  OBJS_SRV = $(OBJS_COMMON:%.o=%_srv.o)
  
+ # where to find gen_keywordlist.pl and subsidiary files
+ TOOLSDIR = $(top_srcdir)/src/tools
+ GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl
+ GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm
+ 
  all: libpgcommon.a libpgcommon_shlib.a libpgcommon_srv.a
  
  distprep: kwlist_d.h
*************** libpgcommon_srv.a: $(OBJS_SRV)
*** 118,125 ****
  	$(CC) $(CFLAGS) $(subst -DFRONTEND,, $(CPPFLAGS)) -c $< -o $@
  
  # generate SQL keyword lookup table to be included into keywords*.o.
! kwlist_d.h: $(top_srcdir)/src/include/parser/kwlist.h $(top_srcdir)/src/tools/gen_keywordlist.pl
! 	$(PERL) $(top_srcdir)/src/tools/gen_keywordlist.pl --extern $<
  
  # Dependencies of keywords*.o need to be managed explicitly to make sure
  # that you don't get broken parsing code, even in a non-enable-depend build.
--- 123,130 ----
  	$(CC) $(CFLAGS) $(subst -DFRONTEND,, $(CPPFLAGS)) -c $< -o $@
  
  # generate SQL keyword lookup table to be included into keywords*.o.
! kwlist_d.h: $(top_srcdir)/src/include/parser/kwlist.h $(GEN_KEYWORDLIST_DEPS)
! 	$(GEN_KEYWORDLIST) --extern $<
  
  # Dependencies of keywords*.o need to be managed explicitly to make sure
  # that you don't get broken parsing code, even in a non-enable-depend build.
diff --git a/src/common/kwlookup.c b/src/common/kwlookup.c
index d72842e..9dc1fee 100644
*** a/src/common/kwlookup.c
--- b/src/common/kwlookup.c
***************
*** 35,94 ****
   * receive a different case-normalization mapping.
   */
  int
! ScanKeywordLookup(const char *text,
  				  const ScanKeywordList *keywords)
  {
! 	int			len,
! 				i;
! 	char		word[NAMEDATALEN];
! 	const char *kw_string;
! 	const uint16 *kw_offsets;
! 	const uint16 *low;
! 	const uint16 *high;
! 
! 	len = strlen(text);
  
  	if (len > keywords->max_kw_len)
! 		return -1;				/* too long to be any keyword */
! 
! 	/* We assume all keywords are shorter than NAMEDATALEN. */
! 	Assert(len < NAMEDATALEN);
  
  	/*
! 	 * Apply an ASCII-only downcasing.  We must not use tolower() since it may
! 	 * produce the wrong translation in some locales (eg, Turkish).
  	 */
! 	for (i = 0; i < len; i++)
! 	{
! 		char		ch = text[i];
  
! 		if (ch >= 'A' && ch <= 'Z')
! 			ch += 'a' - 'A';
! 		word[i] = ch;
! 	}
! 	word[len] = '\0';
  
  	/*
! 	 * Now do a binary search using plain strcmp() comparison.
  	 */
! 	kw_string = keywords->kw_string;
! 	kw_offsets = keywords->kw_offsets;
! 	low = kw_offsets;
! 	high = kw_offsets + (keywords->num_keywords - 1);
! 	while (low <= high)
  	{
! 		const uint16 *middle;
! 		int			difference;
  
! 		middle = low + (high - low) / 2;
! 		difference = strcmp(kw_string + *middle, word);
! 		if (difference == 0)
! 			return middle - kw_offsets;
! 		else if (difference < 0)
! 			low = middle + 1;
! 		else
! 			high = middle - 1;
  	}
  
! 	return -1;
  }
--- 35,89 ----
   * receive a different case-normalization mapping.
   */
  int
! ScanKeywordLookup(const char *str,
  				  const ScanKeywordList *keywords)
  {
! 	size_t		len;
! 	int			h;
! 	const char *kw;
  
+ 	/*
+ 	 * Reject immediately if too long to be any keyword.  This saves useless
+ 	 * hashing and downcasing work on long strings.
+ 	 */
+ 	len = strlen(str);
  	if (len > keywords->max_kw_len)
! 		return -1;
  
  	/*
! 	 * Compute the hash function.  We assume it was generated to produce
! 	 * case-insensitive results.  Since it's a perfect hash, we need only
! 	 * match to the specific keyword it identifies.
  	 */
! 	h = keywords->hash(str, len);
  
! 	/*
! 	 * An out-of-range result implies no match.  (This can happen for
! 	 * non-keyword inputs because the hash function will sum two unrelated
! 	 * hashtable entries.)
! 	 */
! 	if (h < 0 || h >= keywords->num_keywords)
! 		return -1;
  
  	/*
! 	 * Compare character-by-character to see if we have a match, applying an
! 	 * ASCII-only downcasing to the input characters.  We must not use
! 	 * tolower() since it may produce the wrong translation in some locales
! 	 * (eg, Turkish).
  	 */
! 	kw = GetScanKeyword(h, keywords);
! 	while (*str != '\0')
  	{
! 		char		ch = *str++;
  
! 		if (ch >= 'A' && ch <= 'Z')
! 			ch += 'a' - 'A';
! 		if (ch != *kw++)
! 			return -1;
  	}
+ 	if (*kw != '\0')
+ 		return -1;
  
! 	/* Success! */
! 	return h;
  }
diff --git a/src/include/common/kwlookup.h b/src/include/common/kwlookup.h
index 39efb35..dbff367 100644
*** a/src/include/common/kwlookup.h
--- b/src/include/common/kwlookup.h
***************
*** 14,19 ****
--- 14,22 ----
  #ifndef KWLOOKUP_H
  #define KWLOOKUP_H
  
+ /* Hash function used by ScanKeywordLookup */
+ typedef int (*ScanKeywordHashFunc) (const void *key, size_t keylen);
+ 
  /*
   * This struct contains the data needed by ScanKeywordLookup to perform a
   * search within a set of keywords.  The contents are typically generated by
*************** typedef struct ScanKeywordList
*** 23,28 ****
--- 26,32 ----
  {
  	const char *kw_string;		/* all keywords in order, separated by \0 */
  	const uint16 *kw_offsets;	/* offsets to the start of each keyword */
+ 	ScanKeywordHashFunc hash;	/* perfect hash function for keywords */
  	int			num_keywords;	/* number of keywords */
  	int			max_kw_len;		/* length of longest keyword */
  } ScanKeywordList;
diff --git a/src/interfaces/ecpg/preproc/Makefile b/src/interfaces/ecpg/preproc/Makefile
index b5b74a3..6c02f97 100644
*** a/src/interfaces/ecpg/preproc/Makefile
--- b/src/interfaces/ecpg/preproc/Makefile
*************** OBJS=	preproc.o pgc.o type.o ecpg.o outp
*** 28,34 ****
  	keywords.o c_keywords.o ecpg_keywords.o typename.o descriptor.o variable.o \
  	$(WIN32RES)
  
! GEN_KEYWORDLIST = $(top_srcdir)/src/tools/gen_keywordlist.pl
  
  # Suppress parallel build to avoid a bug in GNU make 3.82
  # (see comments in ../Makefile)
--- 28,37 ----
  	keywords.o c_keywords.o ecpg_keywords.o typename.o descriptor.o variable.o \
  	$(WIN32RES)
  
! # where to find gen_keywordlist.pl and subsidiary files
! TOOLSDIR = $(top_srcdir)/src/tools
! GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl
! GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm
  
  # Suppress parallel build to avoid a bug in GNU make 3.82
  # (see comments in ../Makefile)
*************** preproc.y: ../../../backend/parser/gram.
*** 56,66 ****
  	$(PERL) $(srcdir)/check_rules.pl $(srcdir) $<
  
  # generate keyword headers
! c_kwlist_d.h: c_kwlist.h $(GEN_KEYWORDLIST)
! 	$(PERL) $(GEN_KEYWORDLIST) --varname ScanCKeywords $<
  
! ecpg_kwlist_d.h: ecpg_kwlist.h $(GEN_KEYWORDLIST)
! 	$(PERL) $(GEN_KEYWORDLIST) --varname ScanECPGKeywords $<
  
  # Force these dependencies to be known even without dependency info built:
  ecpg_keywords.o c_keywords.o keywords.o preproc.o pgc.o parser.o: preproc.h
--- 59,69 ----
  	$(PERL) $(srcdir)/check_rules.pl $(srcdir) $<
  
  # generate keyword headers
! c_kwlist_d.h: c_kwlist.h $(GEN_KEYWORDLIST_DEPS)
! 	$(GEN_KEYWORDLIST) --varname ScanCKeywords --case $<
  
! ecpg_kwlist_d.h: ecpg_kwlist.h $(GEN_KEYWORDLIST_DEPS)
! 	$(GEN_KEYWORDLIST) --varname ScanECPGKeywords $<
  
  # Force these dependencies to be known even without dependency info built:
  ecpg_keywords.o c_keywords.o keywords.o preproc.o pgc.o parser.o: preproc.h
diff --git a/src/interfaces/ecpg/preproc/c_keywords.c b/src/interfaces/ecpg/preproc/c_keywords.c
index 38ddf6f..387298b 100644
*** a/src/interfaces/ecpg/preproc/c_keywords.c
--- b/src/interfaces/ecpg/preproc/c_keywords.c
***************
*** 9,16 ****
   */
  #include "postgres_fe.h"
  
- #include <ctype.h>
- 
  #include "preproc_extern.h"
  #include "preproc.h"
  
--- 9,14 ----
*************** static const uint16 ScanCKeywordTokens[]
*** 32,70 ****
   *
   * Returns the token value of the keyword, or -1 if no match.
   *
!  * Do a binary search using plain strcmp() comparison.  This is much like
   * ScanKeywordLookup(), except we want case-sensitive matching.
   */
  int
! ScanCKeywordLookup(const char *text)
  {
! 	const char *kw_string;
! 	const uint16 *kw_offsets;
! 	const uint16 *low;
! 	const uint16 *high;
  
! 	if (strlen(text) > ScanCKeywords.max_kw_len)
! 		return -1;				/* too long to be any keyword */
  
! 	kw_string = ScanCKeywords.kw_string;
! 	kw_offsets = ScanCKeywords.kw_offsets;
! 	low = kw_offsets;
! 	high = kw_offsets + (ScanCKeywords.num_keywords - 1);
  
! 	while (low <= high)
! 	{
! 		const uint16 *middle;
! 		int			difference;
  
! 		middle = low + (high - low) / 2;
! 		difference = strcmp(kw_string + *middle, text);
! 		if (difference == 0)
! 			return ScanCKeywordTokens[middle - kw_offsets];
! 		else if (difference < 0)
! 			low = middle + 1;
! 		else
! 			high = middle - 1;
! 	}
  
  	return -1;
  }
--- 30,71 ----
   *
   * Returns the token value of the keyword, or -1 if no match.
   *
!  * Do a hash search using plain strcmp() comparison.  This is much like
   * ScanKeywordLookup(), except we want case-sensitive matching.
   */
  int
! ScanCKeywordLookup(const char *str)
  {
! 	size_t		len;
! 	int			h;
! 	const char *kw;
  
! 	/*
! 	 * Reject immediately if too long to be any keyword.  This saves useless
! 	 * hashing work on long strings.
! 	 */
! 	len = strlen(str);
! 	if (len > ScanCKeywords.max_kw_len)
! 		return -1;
  
! 	/*
! 	 * Compute the hash function.  Since it's a perfect hash, we need only
! 	 * match to the specific keyword it identifies.
! 	 */
! 	h = ScanCKeywords_hash_func(str, len);
  
! 	/*
! 	 * An out-of-range result implies no match.  (This can happen for
! 	 * non-keyword inputs because the hash function will sum two unrelated
! 	 * hashtable entries.)
! 	 */
! 	if (h < 0 || h >= ScanCKeywords.num_keywords)
! 		return -1;
  
! 	kw = GetScanKeyword(h, &ScanCKeywords);
! 
! 	if (strcmp(kw, str) == 0)
! 		return ScanCKeywordTokens[h];
  
  	return -1;
  }
diff --git a/src/pl/plpgsql/src/Makefile b/src/pl/plpgsql/src/Makefile
index 9dd4a74..8a0f294 100644
*** a/src/pl/plpgsql/src/Makefile
--- b/src/pl/plpgsql/src/Makefile
*************** REGRESS_OPTS = --dbname=$(PL_TESTDB)
*** 29,35 ****
  REGRESS = plpgsql_call plpgsql_control plpgsql_domain plpgsql_record \
  	plpgsql_cache plpgsql_transaction plpgsql_varprops
  
! GEN_KEYWORDLIST = $(top_srcdir)/src/tools/gen_keywordlist.pl
  
  all: all-lib
  
--- 29,38 ----
  REGRESS = plpgsql_call plpgsql_control plpgsql_domain plpgsql_record \
  	plpgsql_cache plpgsql_transaction plpgsql_varprops
  
! # where to find gen_keywordlist.pl and subsidiary files
! TOOLSDIR = $(top_srcdir)/src/tools
! GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl
! GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm
  
  all: all-lib
  
*************** plerrcodes.h: $(top_srcdir)/src/backend/
*** 76,86 ****
  	$(PERL) $(srcdir)/generate-plerrcodes.pl $< > $@
  
  # generate keyword headers for the scanner
! pl_reserved_kwlist_d.h: pl_reserved_kwlist.h $(GEN_KEYWORDLIST)
! 	$(PERL) $(GEN_KEYWORDLIST) --varname ReservedPLKeywords $<
  
! pl_unreserved_kwlist_d.h: pl_unreserved_kwlist.h $(GEN_KEYWORDLIST)
! 	$(PERL) $(GEN_KEYWORDLIST) --varname UnreservedPLKeywords $<
  
  
  check: submake
--- 79,89 ----
  	$(PERL) $(srcdir)/generate-plerrcodes.pl $< > $@
  
  # generate keyword headers for the scanner
! pl_reserved_kwlist_d.h: pl_reserved_kwlist.h $(GEN_KEYWORDLIST_DEPS)
! 	$(GEN_KEYWORDLIST) --varname ReservedPLKeywords $<
  
! pl_unreserved_kwlist_d.h: pl_unreserved_kwlist.h $(GEN_KEYWORDLIST_DEPS)
! 	$(GEN_KEYWORDLIST) --varname UnreservedPLKeywords $<
  
  
  check: submake
diff --git a/src/tools/PerfectHash.pm b/src/tools/PerfectHash.pm
index ...34d55cf .
*** a/src/tools/PerfectHash.pm
--- b/src/tools/PerfectHash.pm
***************
*** 0 ****
--- 1,336 ----
+ #----------------------------------------------------------------------
+ #
+ # PerfectHash.pm
+ #    Perl module that constructs minimal perfect hash functions
+ #
+ # This code constructs a minimal perfect hash function for the given
+ # set of keys, using an algorithm described in
+ # "An optimal algorithm for generating minimal perfect hash functions"
+ # by Czech, Havas and Majewski in Information Processing Letters,
+ # 43(5):256-264, October 1992.
+ # This implementation is loosely based on NetBSD's "nbperf",
+ # which was written by Joerg Sonnenberger.
+ #
+ # The resulting hash function is perfect in the sense that if the presented
+ # key is one of the original set, it will return the key's index in the set
+ # (in range 0..N-1).  However, the caller must still verify the match,
+ # as false positives are possible.  Also, the hash function may return
+ # values that are out of range (negative, or >= N).  This indicates that
+ # the presented key is definitely not in the set.
+ #
+ #
+ # Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+ # Portions Copyright (c) 1994, Regents of the University of California
+ #
+ # src/tools/PerfectHash.pm
+ #
+ #----------------------------------------------------------------------
+ 
+ package PerfectHash;
+ 
+ use strict;
+ use warnings;
+ use Exporter 'import';
+ 
+ our @EXPORT_OK = qw(
+   generate_hash_function
+ );
+ 
+ 
+ # At runtime, we'll compute two simple hash functions of the input key,
+ # and use them to index into a mapping table.  The hash functions are just
+ # multiply-and-add in uint32 arithmetic, with different multipliers but
+ # the same initial seed.  All the complexity in this module is concerned
+ # with selecting hash parameters that will work and building the mapping
+ # table.
+ 
+ # We support making case-insensitive hash functions, though this only
+ # works for a strict-ASCII interpretation of case insensitivity,
+ # ie, A-Z maps onto a-z and nothing else.
+ my $case_insensitive = 0;
+ 
+ 
+ #
+ # Construct a C function implementing a perfect hash for the given keys.
+ # The C function definition is returned as a string.
+ #
+ # The keys can be any set of Perl strings; it is caller's responsibility
+ # that there not be any duplicates.  (Note that the "strings" can be
+ # binary data, but endianness is the caller's problem.)
+ #
+ # The name to use for the function is caller-specified, but its signature
+ # is always "int f(const void *key, size_t keylen)".  The caller may
+ # prepend "static " to the result string if it wants a static function.
+ #
+ # If $ci is true, the function is case-insensitive, for the limited idea
+ # of case-insensitivity explained above.
+ #
+ sub generate_hash_function
+ {
+ 	my ($keys_ref, $funcname, $ci) = @_;
+ 
+ 	# It's not worth passing this around as a parameter; just use a global.
+ 	$case_insensitive = $ci;
+ 
+ 	# Try different hash function parameters until we find a set that works
+ 	# for these keys.  In principle we might need to change multipliers,
+ 	# but these two multipliers are chosen to be cheap to calculate via
+ 	# shift-and-add, so don't change them except at great need.
+ 	my $hash_mult1 = 31;
+ 	my $hash_mult2 = 37;
+ 
+ 	# We just try successive hash seed values until we find one that works.
+ 	# (Commonly, random seeds are tried, but we want reproducible results
+ 	# from this program so we don't do that.)
+ 	my $hash_seed;
+ 	my @subresult;
+ 	for ($hash_seed = 0;; $hash_seed++)
+ 	{
+ 		@subresult =
+ 		  _construct_hash_table($keys_ref, $hash_mult1, $hash_mult2,
+ 			$hash_seed);
+ 		last if @subresult;
+ 	}
+ 
+ 	# Extract info from the function result array.
+ 	my $elemtype = $subresult[0];
+ 	my @hashtab  = @{ $subresult[1] };
+ 	my $nhash    = scalar(@hashtab);
+ 
+ 	# OK, construct the hash function definition including the hash table.
+ 	my $f = '';
+ 	$f .= sprintf "int\n";
+ 	$f .= sprintf "%s(const void *key, size_t keylen)\n{\n", $funcname;
+ 	$f .= sprintf "\tstatic const %s h[%d] = {\n", $elemtype, $nhash;
+ 	for (my $i = 0; $i < $nhash; $i++)
+ 	{
+ 		$f .= sprintf "%s%6d,%s",
+ 		  ($i % 8 == 0 ? "\t\t" : " "),
+ 		  $hashtab[$i],
+ 		  ($i % 8 == 7 ? "\n" : "");
+ 	}
+ 	$f .= sprintf "\n" if ($nhash % 8 != 0);
+ 	$f .= sprintf "\t};\n\n";
+ 	$f .= sprintf "\tconst unsigned char *k = key;\n";
+ 	$f .= sprintf "\tuint32\t\ta = %d;\n",   $hash_seed;
+ 	$f .= sprintf "\tuint32\t\tb = %d;\n\n", $hash_seed;
+ 	$f .= sprintf "\twhile (keylen--)\n\t{\n";
+ 	$f .= sprintf "\t\tunsigned char c = *k++";
+ 	$f .= sprintf " | 0x20" if $case_insensitive;    # see comment below
+ 	$f .= sprintf ";\n\n";
+ 	$f .= sprintf "\t\ta = a * %d + c;\n", $hash_mult1;
+ 	$f .= sprintf "\t\tb = b * %d + c;\n", $hash_mult2;
+ 	$f .= sprintf "\t}\n";
+ 	$f .= sprintf "\treturn h[a %% %d] + h[b %% %d];\n", $nhash, $nhash;
+ 	$f .= sprintf "}\n";
+ 
+ 	return $f;
+ }
+ 
+ 
+ # Calculate a hash function as the run-time code will do.
+ #
+ # If we are making a case-insensitive hash function, we implement that
+ # by OR'ing 0x20 into each byte of the key.  This correctly transforms
+ # upper-case ASCII into lower-case ASCII, while not changing digits or
+ # dollar signs.  (It does change '_', else we could just skip adjusting
+ # $cn here at all, for typical keyword strings.)
+ sub _calc_hash
+ {
+ 	my ($key, $mult, $seed) = @_;
+ 
+ 	my $result = $seed;
+ 	for my $c (split //, $key)
+ 	{
+ 		my $cn = ord($c);
+ 		$cn |= 0x20 if $case_insensitive;
+ 		$result = ($result * $mult + $cn) % 4294967296;
+ 	}
+ 	return $result;
+ }
+ 
+ 
+ # Attempt to construct a mapping table for a minimal perfect hash function
+ # for the given keys, using the specified hash parameters.
+ #
+ # Returns an array containing the mapping table element type name as the
+ # first element, and a ref to an array of the table values as the second.
+ #
+ # Returns an empty array on failure; then caller should choose different
+ # hash parameter(s) and try again.
+ sub _construct_hash_table
+ {
+ 	my ($keys_ref, $hash_mult1, $hash_mult2, $hash_seed) = @_;
+ 	my @keys = @{$keys_ref};
+ 
+ 	# This algorithm is based on a graph whose edges correspond to the
+ 	# keys and whose vertices correspond to entries of the mapping table.
+ 	# A key edge links the two vertices whose indexes are the outputs of
+ 	# the two hash functions for that key.  For K keys, the mapping
+ 	# table must have at least 2*K+1 entries, guaranteeing that there's at
+ 	# least one unused entry.  (In principle, larger mapping tables make it
+ 	# easier to find a workable hash and increase the number of inputs that
+ 	# can be rejected due to touching unused hashtable entries.  In practice,
+ 	# neither effect seems strong enough to justify using a larger table.)
+ 	my $nedges = scalar @keys;       # number of edges
+ 	my $nverts = 2 * $nedges + 1;    # number of vertices
+ 
+ 	# Initialize the array of edges.
+ 	my @E = ();
+ 	foreach my $kw (@keys)
+ 	{
+ 		# Calculate hashes for this key.
+ 		# The hashes are immediately reduced modulo the mapping table size.
+ 		my $hash1 = _calc_hash($kw, $hash_mult1, $hash_seed) % $nverts;
+ 		my $hash2 = _calc_hash($kw, $hash_mult2, $hash_seed) % $nverts;
+ 
+ 		# If the two hashes are the same for any key, we have to fail
+ 		# since this edge would itself form a cycle in the graph.
+ 		return () if $hash1 == $hash2;
+ 
+ 		# Add the edge for this key.
+ 		push @E, { left => $hash1, right => $hash2 };
+ 	}
+ 
+ 	# Initialize the array of vertices, giving them all empty lists
+ 	# of associated edges.  (The lists will be hashes of edge numbers.)
+ 	my @V = ();
+ 	for (my $v = 0; $v < $nverts; $v++)
+ 	{
+ 		push @V, { edges => {} };
+ 	}
+ 
+ 	# Insert each edge in the lists of edges using its vertices.
+ 	for (my $e = 0; $e < $nedges; $e++)
+ 	{
+ 		my $v = $E[$e]{left};
+ 		$V[$v]{edges}->{$e} = 1;
+ 
+ 		$v = $E[$e]{right};
+ 		$V[$v]{edges}->{$e} = 1;
+ 	}
+ 
+ 	# Now we attempt to prove the graph acyclic.
+ 	# A cycle-free graph is either empty or has some vertex of degree 1.
+ 	# Removing the edge attached to that vertex doesn't change this property,
+ 	# so doing that repeatedly will reduce the size of the graph.
+ 	# If the graph is empty at the end of the process, it was acyclic.
+ 	# We track the order of edge removal so that the next phase can process
+ 	# them in reverse order of removal.
+ 	my @output_order = ();
+ 
+ 	# Consider each vertex as a possible starting point for edge-removal.
+ 	for (my $startv = 0; $startv < $nverts; $startv++)
+ 	{
+ 		my $v = $startv;
+ 
+ 		# If vertex v is of degree 1 (i.e. exactly 1 edge connects to it),
+ 		# remove that edge, and then consider the edge's other vertex to see
+ 		# if it is now of degree 1.  The inner loop repeats until reaching a
+ 		# vertex not of degree 1.
+ 		while (scalar(keys(%{ $V[$v]{edges} })) == 1)
+ 		{
+ 			# Unlink its only edge.
+ 			my $e = (keys(%{ $V[$v]{edges} }))[0];
+ 			delete($V[$v]{edges}->{$e});
+ 
+ 			# Unlink the edge from its other vertex, too.
+ 			my $v2 = $E[$e]{left};
+ 			$v2 = $E[$e]{right} if ($v2 == $v);
+ 			delete($V[$v2]{edges}->{$e});
+ 
+ 			# Push e onto the front of the output-order list.
+ 			unshift @output_order, $e;
+ 
+ 			# Consider v2 on next iteration of inner loop.
+ 			$v = $v2;
+ 		}
+ 	}
+ 
+ 	# We succeeded only if all edges were removed from the graph.
+ 	return () if (scalar(@output_order) != $nedges);
+ 
+ 	# OK, build the hash table of size $nverts.
+ 	my @hashtab = (0) x $nverts;
+ 	# We need a "visited" flag array in this step, too.
+ 	my @visited = (0) x $nverts;
+ 
+ 	# The idea is that for any key, the sum of the hash table entries
+ 	# for its first and second hash values is the desired output (i.e., the
+ 	# key number).  By assigning hash table values in the selected edge
+ 	# order, we can guarantee that that's true.
+ 	foreach my $e (@output_order)
+ 	{
+ 		my $l = $E[$e]{left};
+ 		my $r = $E[$e]{right};
+ 		if (!$visited[$l])
+ 		{
+ 			# $hashtab[$r] might be zero, or some previously assigned value.
+ 			$hashtab[$l] = $e - $hashtab[$r];
+ 		}
+ 		else
+ 		{
+ 			die "oops, doubly used hashtab entry" if $visited[$r];
+ 			# $hashtab[$l] might be zero, or some previously assigned value.
+ 			$hashtab[$r] = $e - $hashtab[$l];
+ 		}
+ 		# Now freeze both of these hashtab entries.
+ 		$visited[$l] = 1;
+ 		$visited[$r] = 1;
+ 	}
+ 
+ 	# Detect range of values needed in hash table.
+ 	my $hmin = $nedges;
+ 	my $hmax = 0;
+ 	for (my $v = 0; $v < $nverts; $v++)
+ 	{
+ 		$hmin = $hashtab[$v] if $hashtab[$v] < $hmin;
+ 		$hmax = $hashtab[$v] if $hashtab[$v] > $hmax;
+ 	}
+ 
+ 	# Choose width of hashtable entries.  In addition to the actual values,
+ 	# we need to be able to store a flag for unused entries, and we wish to
+ 	# have the property that adding any other entry value to the flag gives
+ 	# an out-of-range result (>= $nedges).
+ 	my $elemtype;
+ 	my $unused_flag;
+ 
+ 	if (   $hmin >= -0x7F
+ 		&& $hmax <= 0x7F
+ 		&& $hmin + 0x7F >= $nedges)
+ 	{
+ 		# int8 will work
+ 		$elemtype    = 'int8';
+ 		$unused_flag = 0x7F;
+ 	}
+ 	elsif ($hmin >= -0x7FFF
+ 		&& $hmax <= 0x7FFF
+ 		&& $hmin + 0x7FFF >= $nedges)
+ 	{
+ 		# int16 will work
+ 		$elemtype    = 'int16';
+ 		$unused_flag = 0x7FFF;
+ 	}
+ 	elsif ($hmin >= -0x7FFFFFFF
+ 		&& $hmax <= 0x7FFFFFFF
+ 		&& $hmin + 0x3FFFFFFF >= $nedges)
+ 	{
+ 		# int32 will work
+ 		$elemtype    = 'int32';
+ 		$unused_flag = 0x3FFFFFFF;
+ 	}
+ 	else
+ 	{
+ 		die "hash table values too wide";
+ 	}
+ 
+ 	# Set any unvisited hashtable entries to $unused_flag.
+ 	for (my $v = 0; $v < $nverts; $v++)
+ 	{
+ 		$hashtab[$v] = $unused_flag if !$visited[$v];
+ 	}
+ 
+ 	return ($elemtype, \@hashtab);
+ }
+ 
+ 1;
diff --git a/src/tools/gen_keywordlist.pl b/src/tools/gen_keywordlist.pl
index d764aff..e912c3e 100644
*** a/src/tools/gen_keywordlist.pl
--- b/src/tools/gen_keywordlist.pl
***************
*** 14,19 ****
--- 14,25 ----
  # variable named according to the -v switch ("ScanKeywords" by default).
  # The variable is marked "static" unless the -e switch is given.
  #
+ # ScanKeywordList uses hash-based lookup, so this script also selects
+ # a minimal perfect hash function for the keyword set, and emits a
+ # static hash function that is referenced in the ScanKeywordList struct.
+ # The hash function is case-insensitive unless --case is specified.
+ # Note that case insensitivity assumes all-ASCII keywords!
+ #
  #
  # Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
  # Portions Copyright (c) 1994, Regents of the University of California
***************
*** 25,39 ****
  use strict;
  use warnings;
  use Getopt::Long;
  
  my $output_path = '';
  my $extern = 0;
  my $varname = 'ScanKeywords';
  
  GetOptions(
! 	'output:s' => \$output_path,
! 	'extern'   => \$extern,
! 	'varname:s' => \$varname) || usage();
  
  my $kw_input_file = shift @ARGV || die "No input file.\n";
  
--- 31,48 ----
  use strict;
  use warnings;
  use Getopt::Long;
+ use PerfectHash;
  
  my $output_path = '';
  my $extern = 0;
+ my $case_sensitive = 0;
  my $varname = 'ScanKeywords';
  
  GetOptions(
! 	'output:s'       => \$output_path,
! 	'extern'         => \$extern,
! 	'case-sensitive' => \$case_sensitive,
! 	'varname:s'      => \$varname) || usage();
  
  my $kw_input_file = shift @ARGV || die "No input file.\n";
  
*************** while (<$kif>)
*** 87,93 ****
--- 96,117 ----
  	}
  }
  
+ # When being case-insensitive, insist that the input be all-lower-case.
+ if (!$case_sensitive)
+ {
+ 	foreach my $kw (@keywords)
+ 	{
+ 		die qq|The keyword "$kw" is not lower-case in $kw_input_file\n|
+ 		  if ($kw ne lc $kw);
+ 	}
+ }
+ 
  # Error out if the keyword names are not in ASCII order.
+ #
+ # While this isn't really necessary with hash-based lookup, it's still
+ # helpful because it provides a cheap way to reject duplicate keywords.
+ # Also, insisting on sorted order ensures that code that scans the keyword
+ # table linearly will see the keywords in a canonical order.
  for my $i (0..$#keywords - 1)
  {
  	die qq|The keyword "$keywords[$i + 1]" is out of order in $kw_input_file\n|
*************** print $kwdef "};\n\n";
*** 128,142 ****
  
  printf $kwdef "#define %s_NUM_KEYWORDS %d\n\n", uc $varname, scalar @keywords;
  
  # Emit the struct that wraps all this lookup info into one variable.
  
! print $kwdef "static " if !$extern;
  printf $kwdef "const ScanKeywordList %s = {\n", $varname;
  printf $kwdef qq|\t%s_kw_string,\n|, $varname;
  printf $kwdef qq|\t%s_kw_offsets,\n|, $varname;
  printf $kwdef qq|\t%s_NUM_KEYWORDS,\n|, uc $varname;
  printf $kwdef qq|\t%d\n|, $max_len;
! print $kwdef "};\n\n";
  
  printf $kwdef "#endif\t\t\t\t\t\t\t/* %s_H */\n", uc $base_filename;
  
--- 152,176 ----
  
  printf $kwdef "#define %s_NUM_KEYWORDS %d\n\n", uc $varname, scalar @keywords;
  
+ # Emit the definition of the hash function.
+ 
+ my $funcname = $varname . "_hash_func";
+ 
+ my $f = PerfectHash::generate_hash_function(\@keywords,
+ 	$funcname, !$case_sensitive);
+ 
+ printf $kwdef qq|static %s\n|, $f;
+ 
  # Emit the struct that wraps all this lookup info into one variable.
  
! printf $kwdef "static " if !$extern;
  printf $kwdef "const ScanKeywordList %s = {\n", $varname;
  printf $kwdef qq|\t%s_kw_string,\n|, $varname;
  printf $kwdef qq|\t%s_kw_offsets,\n|, $varname;
+ printf $kwdef qq|\t%s,\n|, $funcname;
  printf $kwdef qq|\t%s_NUM_KEYWORDS,\n|, uc $varname;
  printf $kwdef qq|\t%d\n|, $max_len;
! printf $kwdef "};\n\n";
  
  printf $kwdef "#endif\t\t\t\t\t\t\t/* %s_H */\n", uc $base_filename;
  
*************** Usage: gen_keywordlist.pl [--output/-o <
*** 148,153 ****
--- 182,188 ----
      --output   Output directory (default '.')
      --varname  Name for ScanKeywordList variable (default 'ScanKeywords')
      --extern   Allow the ScanKeywordList variable to be globally visible
+     --case     Keyword matching is to be case-sensitive
  
  gen_keywordlist.pl transforms a list of keywords into a ScanKeywordList.
  The output filename is derived from the input file by inserting _d,
diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm
index 937bf18..8f54e45 100644
*** a/src/tools/msvc/Solution.pm
--- b/src/tools/msvc/Solution.pm
*************** sub GenerateFiles
*** 414,420 ****
  			'src/include/parser/kwlist.h'))
  	{
  		print "Generating kwlist_d.h...\n";
! 		system('perl src/tools/gen_keywordlist.pl --extern -o src/common src/include/parser/kwlist.h');
  	}
  
  	if (IsNewer(
--- 414,420 ----
  			'src/include/parser/kwlist.h'))
  	{
  		print "Generating kwlist_d.h...\n";
! 		system('perl -I src/tools src/tools/gen_keywordlist.pl --extern -o src/common src/include/parser/kwlist.h');
  	}
  
  	if (IsNewer(
*************** sub GenerateFiles
*** 426,433 ****
  	{
  		print "Generating pl_reserved_kwlist_d.h and pl_unreserved_kwlist_d.h...\n";
  		chdir('src/pl/plpgsql/src');
! 		system('perl ../../../tools/gen_keywordlist.pl --varname ReservedPLKeywords pl_reserved_kwlist.h');
! 		system('perl ../../../tools/gen_keywordlist.pl --varname UnreservedPLKeywords pl_unreserved_kwlist.h');
  		chdir('../../../..');
  	}
  
--- 426,433 ----
  	{
  		print "Generating pl_reserved_kwlist_d.h and pl_unreserved_kwlist_d.h...\n";
  		chdir('src/pl/plpgsql/src');
! 		system('perl -I ../../../tools ../../../tools/gen_keywordlist.pl --varname ReservedPLKeywords pl_reserved_kwlist.h');
! 		system('perl -I ../../../tools ../../../tools/gen_keywordlist.pl --varname UnreservedPLKeywords pl_unreserved_kwlist.h');
  		chdir('../../../..');
  	}
  
*************** sub GenerateFiles
*** 440,447 ****
  	{
  		print "Generating c_kwlist_d.h and ecpg_kwlist_d.h...\n";
  		chdir('src/interfaces/ecpg/preproc');
! 		system('perl ../../../tools/gen_keywordlist.pl --varname ScanCKeywords c_kwlist.h');
! 		system('perl ../../../tools/gen_keywordlist.pl --varname ScanECPGKeywords ecpg_kwlist.h');
  		chdir('../../../..');
  	}
  
--- 440,447 ----
  	{
  		print "Generating c_kwlist_d.h and ecpg_kwlist_d.h...\n";
  		chdir('src/interfaces/ecpg/preproc');
! 		system('perl -I ../../../tools ../../../tools/gen_keywordlist.pl --varname ScanCKeywords --case c_kwlist.h');
! 		system('perl -I ../../../tools ../../../tools/gen_keywordlist.pl --varname ScanECPGKeywords ecpg_kwlist.h');
  		chdir('../../../..');
  	}

Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)

Reply via email to