Re: inconsistency and inefficiency in setup_conversion()
On 10/2/18, Michael Paquier wrote: > v4 does not apply anymore. I am moving this patch to next commit fest, > waiting on author. v5 attached. -John Naylor From ea0a180bde325b0383ce7f0b3d48d1ce9e941393 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Mon, 2 Jul 2018 12:52:07 +0700 Subject: [PATCH v5 1/2] Add pg_language lookup. This didn't seem worth doing before, but an upcoming commit will add 88 entries to pg_proc whose languge is 'c'. In addition, we can now use 'internal' in Gen_fmgr.pl instead of looking up the OID. This simplifies that script and its Makefile a bit. --- doc/src/sgml/bki.sgml| 6 +- src/backend/catalog/genbki.pl| 8 +++ src/backend/utils/Gen_fmgrtab.pl | 9 ++- src/backend/utils/Makefile | 8 +-- src/include/catalog/pg_proc.dat | 98 src/include/catalog/pg_proc.h| 2 +- src/tools/msvc/Solution.pm | 4 +- 7 files changed, 68 insertions(+), 67 deletions(-) diff --git a/doc/src/sgml/bki.sgml b/doc/src/sgml/bki.sgml index 0fb309a1bd..176fa8bf4d 100644 --- a/doc/src/sgml/bki.sgml +++ b/doc/src/sgml/bki.sgml @@ -408,8 +408,8 @@ that's error-prone and hard to understand, so for frequently-referenced catalogs, genbki.pl provides mechanisms to write symbolic references instead. Currently this is possible for references -to access methods, functions, operators, opclasses, opfamilies, and -types. The rules are as follows: +to access methods, functions, languages, operators, opclasses, opfamilies, +and types. The rules are as follows: @@ -419,7 +419,7 @@ Use of symbolic references is enabled in a particular catalog column by attaching BKI_LOOKUP(lookuprule) to the column's definition, where lookuprule - is pg_am, pg_proc, + is pg_am, pg_proc, pg_language, pg_operator, pg_opclass, pg_opfamily, or pg_type. 
BKI_LOOKUP can be attached to columns of diff --git a/src/backend/catalog/genbki.pl b/src/backend/catalog/genbki.pl index 649200260a..f321662695 100644 --- a/src/backend/catalog/genbki.pl +++ b/src/backend/catalog/genbki.pl @@ -167,6 +167,13 @@ foreach my $row (@{ $catalog_data{pg_am} }) $amoids{ $row->{amname} } = $row->{oid}; } +# language OID lookup +my %langoids; +foreach my $row (@{ $catalog_data{pg_language} }) +{ + $langoids{ $row->{lanname} } = $row->{oid}; +} + # opclass OID lookup my %opcoids; foreach my $row (@{ $catalog_data{pg_opclass} }) @@ -241,6 +248,7 @@ foreach my $row (@{ $catalog_data{pg_type} }) # Map catalog name to OID lookup. my %lookup_kind = ( pg_am => \%amoids, + pg_language => \%langoids, pg_opclass => \%opcoids, pg_operator => \%operoids, pg_opfamily => \%opfoids, diff --git a/src/backend/utils/Gen_fmgrtab.pl b/src/backend/utils/Gen_fmgrtab.pl index fa30436895..b288c5e5fd 100644 --- a/src/backend/utils/Gen_fmgrtab.pl +++ b/src/backend/utils/Gen_fmgrtab.pl @@ -59,6 +59,8 @@ die "No include path; you must specify -I.\n" if !$include_path; # Note: We pass data file names as arguments and then look for matching # headers to parse the schema from. This is backwards from genbki.pl, # but the Makefile dependencies look more sensible this way. +# We currently only need pg_proc, but retain the possibility of reading +# more than one data file. my %catalogs; my %catalog_data; foreach my $datfile (@input_files) @@ -78,13 +80,10 @@ foreach my $datfile (@input_files) $catalog_data{$catname} = Catalog::ParseData($datfile, $schema, 0); } -# Fetch some values for later. +# Fetch a value for later. my $FirstBootstrapObjectId = Catalog::FindDefinedSymbol('access/transam.h', $include_path, 'FirstBootstrapObjectId'); -my $INTERNALlanguageId = - Catalog::FindDefinedSymbolFromData($catalog_data{pg_language}, - 'INTERNALlanguageId'); # Collect certain fields from pg_proc.dat. 
my @fmgr = (); @@ -94,7 +93,7 @@ foreach my $row (@{ $catalog_data{pg_proc} }) my %bki_values = %$row; # Select out just the rows for internal-language procedures. - next if $bki_values{prolang} ne $INTERNALlanguageId; + next if $bki_values{prolang} ne 'internal'; push @fmgr, { diff --git a/src/backend/utils/Makefile b/src/backend/utils/Makefile index e797539d09..da40f6b6c4 100644 --- a/src/backend/utils/Makefile +++ b/src/backend/utils/Makefile @@ -31,15 +31,11 @@ generated-header-symlinks: $(top_builddir)/src/include/utils/header-stamp $(top_ $(SUBDIRS:%=%-recursive): fmgr-stamp errcodes.h -FMGR_DATA := $(addprefix $(top_srcdir)/src/include/catalog/,\ - pg_language.dat pg_proc.dat \ - ) - # fmgr-stamp records the last time we ran Gen_fmgrtab.pl. We don't rely on # the timestamps of the individual output files, because the Perl script # won't update them if they didn't change (to avoid unnecessary recompiles). -fmgr-stamp: Gen_fmgrtab.pl $(catalogdir)/Catalog.pm $(FMGR_DATA) $(top_srcdir)/src/include/access/transam.h - $(PER
Re: Sync ECPG scanner with core
On 9/30/18, Michael Paquier wrote: > It would be nice to add this patch to the next commit fest: > https://commitfest.postgresql.org/20/ Done, thanks. -John Naylor
Re: Sync ECPG scanner with core
It seems the pach tester is confused by the addition of the demonstration diff file. I'm reattaching just the patchset to see if it turns green. -John Naylor From 107e3c8a0b65b0196ea4370a724c8b2a1b0fdf79 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Sun, 30 Sep 2018 12:51:41 +0700 Subject: [PATCH v1 1/4] First pass at syncing ECPG scanner with the core scanner. Adjust whitespace and formatting, clean up some comments, and move the block of whitespace rules. --- src/backend/parser/scan.l | 2 +- src/fe_utils/psqlscan.l | 2 +- src/interfaces/ecpg/preproc/pgc.l | 773 -- 3 files changed, 408 insertions(+), 369 deletions(-) diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index 950b8b8591..a2454732a1 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -192,7 +192,7 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner); * XXX perhaps \f (formfeed) should be treated as a newline as well? * * XXX if you change the set of whitespace characters, fix scanner_isspace() - * to agree, and see also the plpgsql lexer. + * to agree. */ space [ \t\n\r\f] diff --git a/src/fe_utils/psqlscan.l b/src/fe_utils/psqlscan.l index fdf49875a7..25253b54ea 100644 --- a/src/fe_utils/psqlscan.l +++ b/src/fe_utils/psqlscan.l @@ -151,7 +151,7 @@ extern void psql_yyset_column(int column_no, yyscan_t yyscanner); * XXX perhaps \f (formfeed) should be treated as a newline as well? * * XXX if you change the set of whitespace characters, fix scanner_isspace() - * to agree, and see also the plpgsql lexer. + * to agree. */ space [ \t\n\r\f] diff --git a/src/interfaces/ecpg/preproc/pgc.l b/src/interfaces/ecpg/preproc/pgc.l index 0792118cfe..b96f17ca20 100644 --- a/src/interfaces/ecpg/preproc/pgc.l +++ b/src/interfaces/ecpg/preproc/pgc.l @@ -108,16 +108,19 @@ static struct _if_value * We use exclusive states for quoted strings, extended comments, * and to eliminate parsing troubles for numeric strings. 
* Exclusive states: - * bit string literal - * extended C-style comments in C - * extended C-style comments in SQL - * delimited identifiers (double-quoted identifiers) - thomas 1997-10-27 - * hexadecimal numeric string - thomas 1997-11-16 - * standard quoted strings - thomas 1997-07-30 - * standard quoted strings in C - michael - * extended quoted strings (support backslash escape sequences) - * national character quoted strings + * bit string literal + * extended C-style comments in C + * extended C-style comments in SQL + * delimited identifiers (double-quoted identifiers) + * + * hexadecimal numeric string + * standard quoted strings + * extended quoted strings (support backslash escape sequences) + * national character quoted strings + * standard quoted strings in C * $foo$ quoted strings + * + * * quoted identifier with Unicode escapes * quoted string with Unicode escapes */ @@ -138,6 +141,48 @@ static struct _if_value %x xui %x xus +/* + * In order to make the world safe for Windows and Mac clients as well as + * Unix ones, we accept either \n or \r as a newline. A DOS-style \r\n + * sequence will be seen as two successive newlines, but that doesn't cause + * any problems. SQL-style comments, which start with -- and extend to the + * next newline, are treated as equivalent to a single whitespace character. + * + * NOTE a fine point: if there is no newline following --, we will absorb + * everything to the end of the input as a comment. This is correct. Older + * versions of Postgres failed to recognize -- as a comment if the input + * did not end with a newline. + * + * XXX perhaps \f (formfeed) should be treated as a newline as well? + * + * XXX if you change the set of whitespace characters, fix ecpg_isspace() + * to agree. 
+ */ + +space [ \t\n\r\f] +horiz_space [ \t\f] +newline [\n\r] +non_newline [^\n\r] + +comment ("--"{non_newline}*) + +whitespace ({space}+|{comment}) + +/* + * SQL requires at least one newline in the whitespace separating + * string literals that are to be concatenated. Silly, but who are we + * to argue? Note that {whitespace_with_newline} should not have * after + * it, whereas {whitespace} should generally have a * after it... + */ + +horiz_whitespace ({horiz_space}|{comment}) +whitespace_with_newline ({horiz_whitespace}*{newline}{whitespace}*) + +quote ' +quotestop {quote}{whitespace}* +quotecontinue {quote}{whitespace_with_newline}{quote} +quotefail {quote}{whitespace}*"-" + /* Bit string */ xbstart [bB]{quote} @@ -216,17 +261,17 @@ xdcinside ({xdcqq}|{xdcqdq}|{xdcother}) * The "extended comment" syntax closely resembles allowable operator syntax. * The tricky part here is to get lex to recognize a string starting with * slash-star as a comment, when interpreting it as an operator would produce - * a longer match --- remember lex will prefer a longer match!
Re: WIP: Avoid creation of the free space map for small tables
On 10/6/18, Thomas Munro wrote: > On Sat, Oct 6, 2018 at 7:47 AM John Naylor wrote: >> A while back, Robert Haas noticed that the space taken up by very >> small tables is dominated by the FSM [1]. Tom suggested that we could >> prevent creation of the FSM until the heap has reached a certain >> threshold size [2]. Attached is a WIP patch to implement that. I've >> also attached a SQL script to demonstrate the change in behavior for >> various scenarios. > > Hi John, > > You'll need to tweak the test in contrib/pageinspect/sql/page.sql, > because it's currently asserting that there is an FSM on a small table > so make check-world fails. Whoops, sorry about that; the attached patch passes make check-world. While looking into that, I also found a regression: If the cached target block is the last block in the relation and there is no free space, that block will be tried twice. That's been fixed as well. Thanks, -John Naylor From 49c80647737edfaae82014f2c94a84114f3688f6 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Sun, 7 Oct 2018 21:34:23 +0700 Subject: [PATCH v2] Avoid creation of the free space map for small tables. The FSM isn't created if the heap has fewer than 10 blocks. If the last known good block has insufficient space, try every block before extending the heap. If a heap with a FSM is truncated back to below the threshold, the FSM stays around and can be used as usual. If the heap tuples are all deleted, the FSM stays but has no leaf blocks (same as before). Although it exists, it won't be re-extended until the heap passes the threshold again. 
--- contrib/pageinspect/expected/page.out | 77 +- contrib/pageinspect/sql/page.sql | 33 +--- src/backend/access/heap/hio.c | 96 +-- src/backend/storage/freespace/freespace.c | 35 - src/include/storage/freespace.h | 4 + 5 files changed, 169 insertions(+), 76 deletions(-) diff --git a/contrib/pageinspect/expected/page.out b/contrib/pageinspect/expected/page.out index 3fcd9fbe6d..83e5910453 100644 --- a/contrib/pageinspect/expected/page.out +++ b/contrib/pageinspect/expected/page.out @@ -1,48 +1,69 @@ CREATE EXTENSION pageinspect; -CREATE TABLE test1 (a int, b int); -INSERT INTO test1 VALUES (16777217, 131584); -VACUUM test1; -- set up FSM +CREATE TABLE test_rel_forks (a int); +-- Make sure there are enough blocks in the heap for the FSM to be created. +INSERT INTO test_rel_forks SELECT g from generate_series(1,1) g; +-- set up FSM and VM +VACUUM test_rel_forks; -- The page contents can vary, so just test that it can be read -- successfully, but don't keep the output. -SELECT octet_length(get_raw_page('test1', 'main', 0)) AS main_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'main', 0)) AS main_0; main_0 8192 (1 row) -SELECT octet_length(get_raw_page('test1', 'main', 1)) AS main_1; -ERROR: block number 1 is out of range for relation "test1" -SELECT octet_length(get_raw_page('test1', 'fsm', 0)) AS fsm_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'main', 100)) AS main_100; +ERROR: block number 100 is out of range for relation "test_rel_forks" +SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 0)) AS fsm_0; fsm_0 --- 8192 (1 row) -SELECT octet_length(get_raw_page('test1', 'fsm', 1)) AS fsm_1; - fsm_1 - 8192 -(1 row) - -SELECT octet_length(get_raw_page('test1', 'vm', 0)) AS vm_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 10)) AS fsm_10; +ERROR: block number 10 is out of range for relation "test_rel_forks" +SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 0)) AS vm_0; vm_0 -- 8192 (1 row) -SELECT 
octet_length(get_raw_page('test1', 'vm', 1)) AS vm_1; -ERROR: block number 1 is out of range for relation "test1" +SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 1)) AS vm_1; +ERROR: block number 1 is out of range for relation "test_rel_forks" SELECT octet_length(get_raw_page('xxx', 'main', 0)); ERROR: relation "xxx" does not exist -SELECT octet_length(get_raw_page('test1', 'xxx', 0)); +SELECT octet_length(get_raw_page('test_rel_forks', 'xxx', 0)); ERROR: invalid fork name HINT: Valid fork names are "main", "fsm", "vm", and "init". -SELECT get_raw_page('test1', 0) = get_raw_page('test1', 'main', 0); +SELECT * FROM fsm_page_contents(get_raw_page('test_rel_forks', 'fsm', 0)); + fsm_page_contents +--- + 0: 192 + + 1: 192 + + 3: 192 + + 7: 192 + + 15: 192 + + 31: 192 + + 63: 192 + + 127: 192 + + 255: 192 + + 511: 192 + + 1023: 192+ + 2047: 192+ + 4095: 192+ + fp_next_slot: 0 + + +(1 row) + +SELECT get_raw_page('test_rel_forks', 0) = get_raw_page('
Re: generating bootstrap entries for array types
On 9/20/18, Dagfinn Ilmari Mannsåker wrote: > Hi again John, Hi Ilmari, My apologies -- your messages ended up in my spam folder. I only saw this thread again when Tom replied. Thanks for the review! I'll keep a look out for any future messages. -John Naylor
Re: generating bootstrap entries for array types
On 9/21/18, Tom Lane wrote: > I did some other minor hacking (mostly, fixing the documentation) > and pushed it. Thank you. -John Naylor
Re: doc - add missing documentation for "acldefault"
On 9/19/18, Tom Lane wrote: > However, I don't object to documenting any function that has its > own pg_description string. Speaking of, that description string seems to have been neglected. I've attached a remedy for that. -John Naylor diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 860571440a..6b7088b9ce 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -2073,7 +2073,7 @@ { oid => '1365', descr => 'make ACL item', proname => 'makeaclitem', prorettype => 'aclitem', proargtypes => 'oid oid text bool', prosrc => 'makeaclitem' }, -{ oid => '3943', descr => 'TODO', +{ oid => '3943', descr => 'show default privileges, for use by the information schema', proname => 'acldefault', prorettype => '_aclitem', proargtypes => 'char oid', prosrc => 'acldefault_sql' }, { oid => '1689',
Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)
On 12/27/18, Tom Lane wrote: > diff --git a/src/tools/gen_keywords.pl b/src/tools/gen_keywords.pl > + elsif ($arg =~ /^-o/) > + { > + $output_path = length($arg) > 2 ? substr($arg, 2) : shift @ARGV; > + } > > My perl-fu is not great, but it looks like this will accept arguments > like "-ofilename", which is a style I don't like at all. I'd rather > either insist on the filename being separate or write the switch like > "-o=filename". Also, project style when taking both forms is usually > more like > -o filename > --offset=filename This style was cargo-culted from the catalog scripts. I can settle on just the first form if you like. > +$kw_input_file =~ /((\w*)kwlist)\.h/; > +my $base_filename = $1; > +$prefix = $2 if !defined $prefix; > > Hmm, what happens if the input filename does not end with "kwlist.h"? If that's a maintainability hazard, I can force every invocation to provide a prefix instead. > I looked very briefly at v4-0002, and I'm not very convinced about > the "middle" aspect of that optimization. It seems unmaintainable, > plus you've not exhibited how the preferred keywords would get selected > in the first place (wiring them into the Perl script is surely not > acceptable). What if the second argument of the macro held this info? Something like: PG_KEYWORD("security", FULL_SEARCH, UNRESERVED_KEYWORD) PG_KEYWORD("select", OPTIMIZE, SELECT, RESERVED_KEYWORD) with a warning emitted if more than one keyword per range has OPTIMIZE. That would require all keyword lists to have that second argument, but selecting a preferred keyword would be optional. -John Naylor
Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)
On 12/26/18, John Naylor wrote: > On 12/26/18, Robert Haas wrote: >> I wonder if we could do something really simple like a lookup based on >> the first character of the scan keyword. It looks to me like there are >> 440 keywords right now, and the most common starting letter is 'c', >> which is the first letter of 51 keywords. So dispatching based on the >> first letter clips at least 3 steps off the binary search. I don't >> know whether that's enough to be worthwhile, but it's probably pretty >> simple to implement. > I agree it'd be fairly simple to do, and might raise the bar for doing > anything more complex. I went ahead and did this for v4, but split out into a separate patch. In addition, I used a heuristic to bypass binary search for the most common keywords. Normally, the middle value is computed mathematically, but I found that in each range of keywords beginning with the same letter, there is often 1 or 2 common keywords that are good first guesses, such as select, from, join, limit, where. I taught the lookup to try those first, and then compute subsequent steps the usual way. Barring additional bikeshedding on 0001, I'll plan on implementing offset-based lookup for the other keyword types and retire the old ScanKeyword. Once that's done, we can benchmark and compare with the optimizations in 0002. -John Naylor From ed78318df69bd3fa1543f3dc04d0a6ca9794a81c Mon Sep 17 00:00:00 2001 From: John Naylor Date: Wed, 26 Dec 2018 21:28:27 -0500 Subject: [PATCH v4 1/2] WIP Use offset-based keyword lookup. (For WIP, only for plpgsql unreserved keywords) Replace binary search over an array of ScanKeyword structs with a binary search over an array of offsets into a keyword string. Access auxiliary data only after a keyword hit. This has better locality of reference and a smaller memory footprint. 
--- src/common/keywords.c | 55 + src/include/common/keywords.h | 12 ++ src/pl/plpgsql/src/.gitignore | 1 + src/pl/plpgsql/src/Makefile | 13 +- src/pl/plpgsql/src/pl_scanner.c | 128 +--- src/pl/plpgsql/src/pl_unreserved_kwlist.h | 108 + src/tools/gen_keywords.pl | 137 ++ src/tools/msvc/Solution.pm| 10 ++ 8 files changed, 361 insertions(+), 103 deletions(-) create mode 100644 src/pl/plpgsql/src/pl_unreserved_kwlist.h create mode 100644 src/tools/gen_keywords.pl diff --git a/src/common/keywords.c b/src/common/keywords.c index 0c0c794c68..b0e5a721b6 100644 --- a/src/common/keywords.c +++ b/src/common/keywords.c @@ -112,3 +112,58 @@ ScanKeywordLookup(const char *text, return NULL; } + +/* Like ScanKeywordLookup, but uses offsets into a keyword string. */ +int +ScanKeywordLookupOffset(const char *string_to_lookup, + const char *kw_strings, + const uint16 *kw_offsets, + int num_keywords) +{ + int len, +i; + char word[NAMEDATALEN]; + const uint16 *low; + const uint16 *high; + + len = strlen(string_to_lookup); + /* We assume all keywords are shorter than NAMEDATALEN. */ + if (len >= NAMEDATALEN) + return -1; + + /* + * Apply an ASCII-only downcasing. We must not use tolower() since it may + * produce the wrong translation in some locales (eg, Turkish). + */ + for (i = 0; i < len; i++) + { + char ch = string_to_lookup[i]; + + if (ch >= 'A' && ch <= 'Z') + ch += 'a' - 'A'; + word[i] = ch; + } + word[len] = '\0'; + + /* + * Now do a binary search using plain strcmp() comparison. 
+ */ + low = kw_offsets; + high = kw_offsets + (num_keywords - 1); + while (low <= high) + { + const uint16 *middle; + int difference; + + middle = low + (high - low) / 2; + difference = strcmp(kw_strings + *middle, word); + if (difference == 0) + return middle - kw_offsets; + else if (difference < 0) + low = middle + 1; + else + high = middle - 1; + } + + return -1; +} diff --git a/src/include/common/keywords.h b/src/include/common/keywords.h index 0b31505b66..201d0fcc7a 100644 --- a/src/include/common/keywords.h +++ b/src/include/common/keywords.h @@ -28,6 +28,13 @@ typedef struct ScanKeyword int16 category; /* see codes above */ } ScanKeyword; +/* Payload data for keywords */ +typedef struct ScanKeywordAux +{ + int16 value; /* grammar's token code */ + char category; /* see codes above */ +} ScanKeywordAux; + #ifndef FRONTEND extern PGDLLIMPORT const ScanKeyword ScanKeywords[]; extern PGDLLIMPORT const int NumScanKeywords; @@ -41,4 +48,9 @@ extern const ScanKeyword *ScanKeywordLookup(const char *text, const ScanKeyword *keywords, int num_keywords); +int ScanKeywordLookupOffset(const char *string_to_lookup, + const char *kw_strings, + const uint16 *kw_offsets, + int num_keywords); + #endif /* KEYWORDS_H */ diff --git a/src/pl/plpgsql/src/.gitignore b/src/pl/plpgsql/src/.gitignore
Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)
I think 0001 with complete keyword lookup replacement is in decent enough shape to post. Make check-world passes. A few notes and caveats: -I added an --extern option to the script for the core keyword headers. This also capitalizes variables. -ECPG keyword lookup is a bit different in that the ecpg and sql lookup functions are wrapped in a single function rather than called separately within pgc.l. It might be worth untangling that, but I have not done so. -Some variable names haven't changed even though now they're only referring to token values, which might be confusing. -I haven't checked if I need to install the generated headers. -I haven't measured performance or binary size. If anyone is excited enough to do that, great, otherwise I'll do that as time permits. -There are probably makefile bugs. Now, on to previous review points: On 12/27/18, Tom Lane wrote: > +/* Payload data for keywords */ > +typedef struct ScanKeywordAux > +{ > + int16 value; /* grammar's token code */ > + charcategory; /* see codes above */ > +} ScanKeywordAux; > > There isn't really any point in changing category to "char", because > alignment considerations will mandate that sizeof(ScanKeywordAux) be > a multiple of 2 anyway. With some compilers we could get around that > with a pragma to force non-aligned storage, but doing so would be a > net loss on most non-Intel architectures. Reverted, especially since we can skip the struct entirely for some callers as you pointed out below. > diff --git a/src/pl/plpgsql/src/.gitignore b/src/pl/plpgsql/src/.gitignore > @@ -1,3 +1,4 @@ > +/*kwlist_d.h > > Not a fan of using wildcards in .gitignore files, at least not when > there's just one or two files you intend to match. Removed. 
> # Force these dependencies to be known even without dependency info built: > -pl_gram.o pl_handler.o pl_comp.o pl_exec.o pl_funcs.o pl_scanner.o: > plpgsql.h pl_gram.h plerrcodes.h > +pl_gram.o pl_handler.o pl_comp.o pl_exec.o pl_funcs.o pl_scanner.o: > plpgsql.h pl_gram.h plerrcodes.h pl_unreserved_kwlist_d.h > > Hm, do we really need any more than pl_scanner.o to depend on that header? I think you're right, so separated into a new rule. > +# Parse keyword header for names. > +my @keywords; > +while (<$kif>) > +{ > + if (/^PG_KEYWORD\("(\w+)",\s*\w+,\s*\w+\)/) > > This is assuming more than it should about the number of arguments for > PG_KEYWORD, as well as what's in them. I think it'd be sufficient to > match like this: > > if (/^PG_KEYWORD\("(\w+)",/) ...and... > diff --git a/src/pl/plpgsql/src/pl_unreserved_kwlist.h > b/src/pl/plpgsql/src/pl_unreserved_kwlist.h > +/* name, value, category */ > +PG_KEYWORD("absolute", K_ABSOLUTE, UNRESERVED_KEYWORD) > > Likewise, I'd just have these be two-argument macros. There's no reason > for the various kwlist.h headers to agree on the number of payload > arguments for PG_KEYWORD. Both done, however... > +/* FIXME: Have to redefine this symbol for the WIP. */ > +#undef PG_KEYWORD > +#define PG_KEYWORD(kwname, value, category) {value, category}, > + > +static const ScanKeywordAux unreserved_keywords[] = { > +#include "pl_unreserved_kwlist.h" > }; > > The category isn't useful for this keyword list, so couldn't you > just make this an array of uint16 values? Yes, this works for the unreserved keywords. The reserved ones still need the aux struct to work with the core scanner, even though scan.l doesn't reference category either. This has the consequence that we can't dispense with category, e.g.: PG_KEYWORD("all", K_ALL, RESERVED_KEYWORD) ...unless we do without the struct entirely, but that's not without disadvantages as you mentioned. 
I decided to export the struct (rather than just int16 for category) to the frontend, even though we have to set the token values to zero, since there might someday be another field of use to the frontend. Also to avoid confusion. > I don't mind allowing the prefix to default to empty. What I was > concerned about was that base_filename could end up undefined. > Probably the thing to do is to generate base_filename separately, > say by stripping any initial ".*/" sequence and then substitute > '_' for '.'. I removed assumptions about the filename. > +Options: > +-o output path > +-p optional prefix for generated data structures > > This usage message is pretty vague about how you write the options > (cf gripe above). I tried it like this: Usage: gen_keywords.pl [--output/-o ] [--prefix/-p ] input_file --output Output directory --prefix String prepended to var names in the output file On 12/27/18, Andrew Dunstan wrote
Re: WIP: Avoid creation of the free space map for small tables
On Wed, Jan 16, 2019 at 8:41 AM Amit Kapila wrote: > > On Fri, Jan 11, 2019 at 3:54 AM John Naylor > wrote: > 1. > Commit message: > > Any pages with wasted free space become visible at next relation extension, > > so we still control table bloat. > > I think the free space will be available after the next pass of > vacuum, no? How can relation extension make it available? To explain, this diagram shows the map as it looks for different small table sizes: 0123 A NA ANA NANA So for a 3-block table, the alternating strategy never checks block 1. Any free space block 1 has acquired via delete-and-vacuum will become visible if it extends to 4 blocks. We are accepting a small amount of bloat for improved performance, as discussed. Would it help to include this diagram in the README? > 2. > +2. For very small heap relations, the FSM would be relatively large and > +wasteful, so as of PostgreSQL 12 we refrain from creating the FSM for > +heaps with HEAP_FSM_CREATION_THRESHOLD pages or fewer, both to save space > +and to improve performance. To locate free space in this case, we simply > +iterate over the heap, trying alternating pages in turn. There may be some > +wasted free space in this case, but it becomes visible again upon next > +relation extension. > > a. Again, how space becomes available at next relation extension. > b. I think there is no use of mentioning the version number in the > above comment, this code will be present from PG-12, so one can find > out from which version this optimization is added. It fits with the reference to PG 8.4 earlier in the document. I chose to be consistent, but to be honest, I'm not much in favor of a lot of version references in code/READMEs. > 3. > BlockNumber > RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage, > Size oldSpaceAvail, Size spaceNeeded) > { > .. > + /* First try the local map, if it exists. */ > + if (oldPage < fsm_local_map.nblocks) > + { > .. 
> } > > The comment doesn't appear to be completely in sync with the code. > Can't we just check whether "fsm_local_map.nblocks > 0", if so, we > can use a macro for the same? I have changed this in the attached > patch, see what you think about it. I have used it at a few other > places as well. The macro adds clarity, so I'm in favor of using it. > 4. > + * When we initialize the map, the whole heap is potentially available to > + * try. If a caller wanted to reset the map after another backend extends > + * the relation, this will only flag new blocks as available. No callers > + * do this currently, however. > + */ > +static void > +fsm_local_set(Relation rel, BlockNumber curr_nblocks) > { > .. > + if (blkno >= fsm_local_map.nblocks + 2) > .. > } > > > The way you have tried to support the case as quoted in the comment > "If a caller wanted to reset the map after another backend extends .." > doesn't appear to be solid and I am not sure if it is correct either. I removed this case in v9 and you objected to that as unnecessary, so I reverted it for v10. > We don't have any way to test the same, so I suggest let's try to > simplify the case w.r.t current requirement of this API. I think we > should > some simple logic to try every other block like: > > + blkno = cur_nblocks - 1; > + while (true) > + { > + fsm_local_map.map[blkno] = FSM_LOCAL_AVAIL; > + if (blkno >= 2) > + blkno -= 2; > + else > + break; > + } > > I have changed this in the attached patch. Fine by me. > 5. > +/* > + * Search the local map for an available block to try, in descending order. > + * > + * For use when there is no FSM. > + */ > +static BlockNumber > +fsm_local_search(void) > > We should give a brief explanation as to why we try in descending > order. I have added some explanation in the attached patch, see what > you think about it? > > Apart from the above, I have modified a few comments. I'll include these with some grammar corrections in the next version. 
-- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: WIP: Avoid creation of the free space map for small tables
On Wed, Jan 16, 2019 at 11:40 AM John Naylor wrote: On Wed, Jan 16, 2019 at 8:41 AM Amit Kapila wrote: can use a macro for the same? I have changed this in the attached patch, see what you think about it. I have used it at a few other places as well. The macro adds clarity, so I'm in favor of using it. It just occurred to me that the style FSM_LOCAL_MAP_EXISTS seems more common for macros that refer to constants, and FSMLocalMapExists for expressions, but I've only seen a small amount of the code base. Do we have a style preference here, or is it more a matter of matching the surrounding code? -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: explain plans with information about (modified) gucs
> [v5] Hi Tomas, Peter suggested upthread to use 'settings' rather than 'gucs' for the explain option (and output?), and you seemed to agree. Are you going to include that in a future version? Speaking of the output, v5's default text doesn't match that of formatted text ('GUCs' / 'GUC').
Re: WIP: Avoid creation of the free space map for small tables
On Wed, Jan 16, 2019 at 10:35 PM Amit Kapila wrote: > Yes, I think it would be good if you can explain the concept of > local-map with the help of this example. > Then let's not add a reference to the version number in this case. I Okay, done in v14. I kept your spelling of the new macro. One minor detail added: use uint8 rather than char for the local map array. This seems to be preferred, especially in this file. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services From 38bc2d9b4ae3f0d1ab6d21c54ed638c5aee6d637 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Thu, 17 Jan 2019 12:25:30 -0500 Subject: [PATCH v14 1/2] Avoid creation of the free space map for small heap relations. Previously, all heaps had FSMs. For very small tables, this means that the FSM took up more space than the heap did. This is wasteful, so now we refrain from creating the FSM for heaps with 4 pages or fewer. If the last known target block has insufficient space, we still try to insert into some other page before giving up and extending the relation, since doing otherwise leads to table bloat. Testing showed that trying every page penalized performance slightly, so we compromise and try every other page. This way, we visit at most two pages. Any pages with wasted free space become visible at next relation extension, so we still control table bloat. As a bonus, directly attempting one or two pages can even be faster than consulting the FSM would have been. 
--- contrib/pageinspect/expected/page.out | 77 +++--- contrib/pageinspect/sql/page.sql | 33 +-- doc/src/sgml/storage.sgml | 13 +- src/backend/access/brin/brin.c| 2 +- src/backend/access/brin/brin_pageops.c| 10 +- src/backend/access/heap/hio.c | 47 ++-- src/backend/access/heap/vacuumlazy.c | 17 +- src/backend/access/transam/xact.c | 14 ++ src/backend/storage/freespace/README | 32 ++- src/backend/storage/freespace/freespace.c | 272 +- src/backend/storage/freespace/indexfsm.c | 6 +- src/include/storage/freespace.h | 9 +- src/test/regress/expected/fsm.out | 62 + src/test/regress/parallel_schedule| 6 + src/test/regress/serial_schedule | 1 + src/test/regress/sql/fsm.sql | 46 16 files changed, 545 insertions(+), 102 deletions(-) create mode 100644 src/test/regress/expected/fsm.out create mode 100644 src/test/regress/sql/fsm.sql diff --git a/contrib/pageinspect/expected/page.out b/contrib/pageinspect/expected/page.out index 3fcd9fbe6d..83e5910453 100644 --- a/contrib/pageinspect/expected/page.out +++ b/contrib/pageinspect/expected/page.out @@ -1,48 +1,69 @@ CREATE EXTENSION pageinspect; -CREATE TABLE test1 (a int, b int); -INSERT INTO test1 VALUES (16777217, 131584); -VACUUM test1; -- set up FSM +CREATE TABLE test_rel_forks (a int); +-- Make sure there are enough blocks in the heap for the FSM to be created. +INSERT INTO test_rel_forks SELECT g from generate_series(1,1) g; +-- set up FSM and VM +VACUUM test_rel_forks; -- The page contents can vary, so just test that it can be read -- successfully, but don't keep the output. 
-SELECT octet_length(get_raw_page('test1', 'main', 0)) AS main_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'main', 0)) AS main_0; main_0 8192 (1 row) -SELECT octet_length(get_raw_page('test1', 'main', 1)) AS main_1; -ERROR: block number 1 is out of range for relation "test1" -SELECT octet_length(get_raw_page('test1', 'fsm', 0)) AS fsm_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'main', 100)) AS main_100; +ERROR: block number 100 is out of range for relation "test_rel_forks" +SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 0)) AS fsm_0; fsm_0 --- 8192 (1 row) -SELECT octet_length(get_raw_page('test1', 'fsm', 1)) AS fsm_1; - fsm_1 - 8192 -(1 row) - -SELECT octet_length(get_raw_page('test1', 'vm', 0)) AS vm_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 10)) AS fsm_10; +ERROR: block number 10 is out of range for relation "test_rel_forks" +SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 0)) AS vm_0; vm_0 -- 8192 (1 row) -SELECT octet_length(get_raw_page('test1', 'vm', 1)) AS vm_1; -ERROR: block number 1 is out of range for relation "test1" +SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 1)) AS vm_1; +ERROR: block number 1 is out of range for relation "test_rel_forks" SELECT octet_length(get_raw_page('xxx', 'main', 0)); ERROR: relation "xxx" does not exist -SELECT octet_length(get_raw_page('test1', 'xxx', 0)); +SELECT octet_length(get_raw_page('test_rel_forks', 'xxx', 0)); ERROR: invalid fork name HINT: Valid fork names are "main", "fsm", "vm
Re: Delay locking partitions during INSERT and UPDATE
On 11/22/18, David Rowley wrote: > If required, such operations could LOCK TABLE the top partitioned > table to block the DML operation. There's already a risk of similar > deadlocks from such operations done on multiple separate tables when > the order they're done is not the same as the order the tables are > written in a query, although, in that case, the window for the > deadlock is likely to be much smaller. Is this something that would need documentation anywhere? > With this done, the performance of an INSERT into a 10k partition > partitioned table looks like: > > Setup: > create table hashp (a int) partition by hash(a); > select 'create table hashp'||x::Text || ' partition of hashp for > values with (modulus 1, remainder '||x::text||');' from > generate_Series(0,) x; > \gexec > > hashp_insert.sql: > \set p_a random(1,1000) > insert into hashp values(:p_a); > > Results: > $ psql -c "truncate hashp;" postgres && pgbench -n -f hashp_insert.sql > -M prepared -c 4 -j 4 -T 60 postgres I used a similar test, but with unlogged tables, and "-c 2", and got: normal table: 32000tps 10k partitions / master: 82tps 10k partitions / patch: 7000tps So far I haven't gotten quite as good performance as you and Tomas, although it's still a ~85x improvement. -John Naylor
Re: Delay locking partitions during INSERT and UPDATE
On Sat, Jan 19, 2019 at 10:59 AM Tomas Vondra wrote: > > On 1/19/19 12:05 AM, John Naylor wrote: > > I used a similar test, but with unlogged tables, and "-c 2", and got: > > > > normal table: 32000tps > > 10k partitions / master: 82tps > > 10k partitions / patch: 7000tps > > > > So far I haven't gotten quite as good performance as you and Tomas, > > although it's still a ~85x improvement. > > What hardware are you running the tests on? I wouldn't be surprised if > you were hitting some CPU or I/O bottleneck, which we're not hitting. 9 year-old laptop, Core i3. Side note, I miswrote my test parameters above -- should be "-c4 -j2". -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: Speeding up text_position_next with multibyte encodings
On Sun, Dec 23, 2018 at 9:33 AM Tomas Vondra wrote: > So, what is the expected speedup in a "good/average" case? Do we have > some reasonable real-world workload mixing these cases that could be > used as a realistic benchmark? Not sure about a realistic mix, but I investigated the tradeoffs. First, using the test function Heikki posted upthread [1], I tried a 2 character needle with different haystack sizes, and looked for the position where master and patch were roughly equal (average of 3, within 1%). Beyond this point, the patch regresses. To keep it simple I used UTF-8 only. haystackposition <=23 (patch always faster) 30 23 100 58 1000 ~550 100~55 For very short haystacks, the patch is always faster or regresses only when the needle is close to the end. For longer haystacks, the patch will be faster if the position searched for is less than ~55% of the way to the end, slower if after. Next, I tested finding the position of a 2 character needle in a couple positions towards the front of a 1000 character haystack, plus not present and at the end for comparison. As seen earlier [1], the worst-case regression for this haystack length was 4.4x slower for UTF-8, but that had a skip table collision, and this time I used characters with no bytes in common. Average of 3, with a loop count of 1,000,000: UTF-8 pos. masterpatch diff * 3880ms 144ms-96% 100 4440ms1410ms-68% 333 5700ms4280ms-25% 999 9370ms 12500ms 34% EUC-KR pos. master patchdiff * 3900ms 112ms-97% 100 4410ms1289ms-71% 333 5680ms3970ms-30% 999 9370ms 11600ms 24% The patch is much faster than master when the needle is near the front of a large haystack or not there at all. The majority of cases are measurably faster, and the best case is at least 20x faster. On the whole I'd say this patch is a performance win even without further optimization. I'm marking it ready for committer. [1] https://www.postgresql.org/message-id/05d95c5c-1fb7-29ea-1c5d-e7e941a0a14d%40iki.fi -- -John Naylor
Re: explain plans with information about (modified) gucs
On Sun, Jan 20, 2019 at 2:31 PM Tomas Vondra wrote: > Attached is v6 of the patch, adopting "settings" instead of "guc". Ok, looks good. I tried changing config values with the .conf file, alter system, and alter database, and tried a few queries with auto_explain. I made a pass through the config parameters, and don't see anything obviously left out. I have no comments on the source code. One thing stands out: For the docs on auto_explain, all other options have "Only superusers can change this setting.", but log_settings doesn't. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: WIP: Avoid creation of the free space map for small tables
On Mon, Jan 21, 2019 at 6:32 AM Amit Kapila wrote: > So we won't allow transfer of FSM files if their size is below > HEAP_FSM_CREATION_THRESHOLD. What will be its behavior in link mode? > It seems that the old files will remain there. Will it create any > problem when we try to create the files via the new server, can you > once test this case? I tried upgrading in --link mode, and on the new cluster, enlarging the table past the threshold causes a new FSM to be created as expected. > Also, another case to think in this regard is the upgrade for standby > servers, if you read below paragraph from the user manual [1], you > will see what I am worried about? > > "What this does is to record the links created by pg_upgrade's link > mode that connect files in the old and new clusters on the primary > server. It then finds matching files in the standby's old cluster and > creates links for them in the standby's new cluster. Files that were > not linked on the primary are copied from the primary to the standby. > (They are usually small.)" > > [1] - https://www.postgresql.org/docs/devel/pgupgrade.html Trying this, I ran into a couple problems. I'm probably doing something wrong, but I can't help but think there's a pg_upgrade bug/feature I'm unaware of: I set up my test to have primary directory data1 and for the secondary standby/data1. I instructed pg_upgrade to upgrade data1 into data1u, and I tried the rsync recipe in the docs quoted above, and the upgraded standby wouldn't go into recovery. While debugging that, I found surprisingly that pg_upgrade also went further and upgraded standby/data1 into standby/data1u. I tried deleting standby/data1u before running the rsync command and still nothing. Because the upgraded secondary is non-functional, I can't really answer your question. Not sure if this is normal, but the pg_upgraded new cluster no longer had the replication slot. Re-adding it didn't allow my upgraded secondary to go into recovery, either. 
(I made sure to copy the recovery settings, so that can't be the problem) -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
is there a reason we don't xlog index free space?
I knew that XLogRecordPageWithFreeSpace() is only called by heapam.c, but I figured indexes had their own logic to xlog free space. While investigating something else I found that brin indexes, at least, on the secondary have no FSMs. Is this deliberate? -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: WIP: Avoid creation of the free space map for small tables
On Wed, Jan 23, 2019 at 7:09 AM Amit Kapila wrote: > I think the first two patches (a) removal of dead code in bootstrap > and (b) the core patch to avoid creation of FSM file for the small > table are good now. I have prepared the patches along with commit > message. There is no change except for some changes in README and > commit message of the second patch. Kindly let me know what you think > about them? Good to hear! The additional language is fine. In "Once the FSM is created for heap", I would just change that to "...for a heap". > I think these two patches can go even without the upgrade patch > (during pg_upgrade, conditionally skip transfer of FSMs.) which is > still under discussion. However, I am not in a hurry if you or other > thinks that upgrade patch must be committed along with the second > patch. I think the upgrade patch is generally going on track but > might need some more review. The pg_upgrade piece is a nice-to-have feature and not essential, so can go in later. Additional review is also welcome. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: WIP: Avoid creation of the free space map for small tables
On Mon, Jan 21, 2019 at 6:32 AM Amit Kapila wrote: > Also, another case to think in this regard is the upgrade for standby > servers, if you read below paragraph from the user manual [1], you > will see what I am worried about? > > "What this does is to record the links created by pg_upgrade's link > mode that connect files in the old and new clusters on the primary > server. It then finds matching files in the standby's old cluster and > creates links for them in the standby's new cluster. Files that were > not linked on the primary are copied from the primary to the standby. > (They are usually small.)" > > [1] - https://www.postgresql.org/docs/devel/pgupgrade.html I am still not able to get the upgraded standby to go into recovery without resorting to pg_basebackup, but in another attempt to investigate your question I tried the following (data1 = old cluster, data2 = new cluster): mkdir -p data1 data2 standby echo 'heap' > data1/foo echo 'fsm' > data1/foo_fsm # simulate streaming replication rsync --archive data1 standby # simulate pg_upgrade, skipping FSM ln data1/foo -t data2/ rsync --archive --delete --hard-links --size-only --no-inc-recursive data1 data2 standby # result ls standby/data1 ls standby/data2 The result is that foo_fsm is not copied to standby/data2, contrary to what the docs above imply for other unlinked files. Can anyone shed light on this? -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: WIP: Avoid creation of the free space map for small tables
On Sat, Jan 19, 2019 at 8:06 AM Amit Kapila wrote: > > On Thu, Jan 17, 2019 at 11:13 PM John Naylor > Few more comments: > 1. > I think we should not allow to create FSM for toast tables as well > till there size reaches HEAP_FSM_CREATION_THRESHOLD. If you try below > test, you can see that FSM will be created for the toast table even if > the size of toast relation is 1 page. ... > I have fixed this in the attached patch, kindly verify it once and see > if you can add the test for same as well. Works for me. For v16, I've added and tested similar logic to pg_upgrade and verified that toast tables work the same as normal tables in recovery. I used a slightly different method to generate the long random string to avoid creating a function. Also, some cosmetic adjustments -- I changed the regression test to use 'i' instead of 'g' to match the use of generate_series in most other tests, and made capitalization more consistent. > 2. > -CREATE TABLE test1 (a int, b int); > -INSERT INTO test1 VALUES (16777217, 131584); > +CREATE TABLE test_rel_forks (a > int); > +-- Make sure there are enough blocks in the heap for the FSM to be created. > +INSERT INTO test_rel_forks SELECT g > from generate_series(1,1) g; > > -VACUUM test1; -- set up FSM > +-- set up FSM and VM > +VACUUM test_rel_forks; > > This test will create 45 pages instead of 1. I know that to create > FSM, we now need more than 4 pages, but 45 seems to be on the higher > side. I think we should not unnecessarily populate more data if there > is no particular need for it, let's restrict the number of pages to 5 > if possible. Good idea, done here and in the fsm regression test. > 3. 
> -SELECT octet_length(get_raw_page('test1', 'fsm', 1)) AS fsm_1; > - fsm_1 > > - 8192 > -(1 row) > - > -SELECT octet_length > (get_raw_page('test1', 'vm', 0)) AS vm_0; > +SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 10)) AS fsm_10; > +ERROR: block number 10 is out of range for relation "test_rel_forks" > > Why have you changed the test definition here? Previously test checks > the existing FSM page, but now it tries to access out of range page. The patch is hard to read here, but I still have a test for the existing FSM page: -SELECT octet_length(get_raw_page('test1', 'fsm', 0)) AS fsm_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'main', 100)) AS main_100; +ERROR: block number 100 is out of range for relation "test_rel_forks" +SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 0)) AS fsm_0; fsm_0 --- 8192 (1 row) I have a test for in-range and out-of-range for each relation fork. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services From ecf0cecee8b0adf8529ddb1fbdabd041172de5e3 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Sat, 19 Jan 2019 18:39:39 -0500 Subject: [PATCH v16 1/2] Avoid creation of the free space map for small heap relations. Previously, all heaps had FSMs. For very small tables, this means that the FSM took up more space than the heap did. This is wasteful, so now we refrain from creating the FSM for heaps with 4 pages or fewer. If the last known target block has insufficient space, we still try to insert into some other page before before giving up and extending the relation, since doing otherwise leads to table bloat. Testing showed that trying every page penalized performance slightly, so we compromise and try every other page. This way, we visit at most two pages. Any pages with wasted free space become visible at next relation extension, so we still control table bloat. 
As a bonus, directly attempting one or two pages can even be faster than consulting the FSM would have been. --- contrib/pageinspect/expected/page.out | 77 +++--- contrib/pageinspect/sql/page.sql | 33 ++- doc/src/sgml/storage.sgml | 13 +- src/backend/access/brin/brin.c| 2 +- src/backend/access/brin/brin_pageops.c| 10 +- src/backend/access/heap/hio.c | 47 ++-- src/backend/access/heap/vacuumlazy.c | 17 +- src/backend/access/transam/xact.c | 14 ++ src/backend/storage/freespace/README | 33 ++- src/backend/storage/freespace/freespace.c | 275 +- src/backend/storage/freespace/indexfsm.c | 6 +- src/include/storage/freespace.h | 9 +- src/test/regress/expected/fsm.out | 75 ++ src/test/regress/parallel_schedule| 6 + src/test/regress/serial_schedule | 1 + src/test/regress/sql/fsm.sql | 55 + 16 files changed, 571 insertions(+), 102 deletions(-) create mode 100644 src/test/regress/expected/fsm.out create mode 100644 src/test/regress/sql/fsm.sql diff --git a/contrib/pageinspect/expected/page.out b/contrib/pageinspect/
Re: WIP: Avoid creation of the free space map for small tables
On 11/24/18, Amit Kapila wrote: > 4. You have mentioned above that "system catalogs created during > bootstrap still have a FSM if they have any data." and I can also see > this behavior, have you investigated this point further? I found the cause of this. There is some special-case code in md.c to create any file if it's opened in bootstrap mode. I removed this and a similar special case (attached), and make check still passes. After digging through the history, I'm guessing this has been useless code since about 2001, when certain special catalogs were removed. -John Naylor diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 4c6a50509f..9331c8b1d1 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -310,13 +310,7 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) { int save_errno = errno; - /* - * During bootstrap, there are cases where a system relation will be - * accessed (by internal backend processes) before the bootstrap - * script nominally creates it. Therefore, allow the file to exist - * already, even if isRedo is not set. (See also mdopen) - */ - if (isRedo || IsBootstrapProcessingMode()) + if (isRedo) fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); if (fd < 0) { @@ -572,26 +566,15 @@ mdopen(SMgrRelation reln, ForkNumber forknum, int behavior) if (fd < 0) { - /* - * During bootstrap, there are cases where a system relation will be - * accessed (by internal backend processes) before the bootstrap - * script nominally creates it. Therefore, accept mdopen() as a - * substitute for mdcreate() in bootstrap mode only. 
(See mdcreate) - */ - if (IsBootstrapProcessingMode()) - fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); - if (fd < 0) + if ((behavior & EXTENSION_RETURN_NULL) && + FILE_POSSIBLY_DELETED(errno)) { - if ((behavior & EXTENSION_RETURN_NULL) && -FILE_POSSIBLY_DELETED(errno)) - { -pfree(path); -return NULL; - } - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", path))); + pfree(path); + return NULL; } + ereport(ERROR, +(errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); } pfree(path);
Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)
On 12/17/18, Tom Lane wrote: > John Naylor writes: >> Since PL/pgSQL uses the core scanner, we'd need to use offsets in its >> reserved_keywords[], too. Those don't change much, so we can probably >> get away with hard-coding the offsets and the giant string in that >> case. (If that's not acceptable, we could separate that out to >> pl_reserved_kwlist.h and reuse the above tooling to generate >> pl_reserved_kwlist_{offset,string}.h, but that's more complex.) > > plpgsql isn't as stable as all that: people propose new syntax for it > all the time. I do not think a hand-maintained array would be pleasant > at all. Okay. > Also, wouldn't we also adopt this technology for its unreserved keywords, > too? We wouldn't be forced to, but there might be other reasons to do so. Were you thinking of code consistency (within pl_scanner.c or globally)? Or something else? If we did adopt this setup for plpgsql unreserved keywords, ecpg/preproc/ecpg_keywords.c and ecpg/preproc/c_keywords.c would be left using the current ScanKeyword struct for search. Using offset search for all 5 types of keywords would be globally consistent, but it also means additional headers, generated headers, and makefile rules. -John Naylor
Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)
On 12/20/18, Tom Lane wrote: > I'd be inclined to put the script in src/tools, I think. IMO src/common > is for code that actually gets built into our executables. Done. >> which takes >> pl_unreserved_kwlist.h as input and outputs >> pl_unreserved_kwlist_offset.h and pl_unreserved_kwlist_string.h. > > I wonder whether we'd not be better off producing just one output > file, in which we have the offsets emitted as PG_KEYWORD macros > and then the giant string emitted as a macro definition, ie > something like > > #define PG_KEYWORD_STRING \ > "absolute\0" \ > "alias\0" \ > ... > > That simplifies the Makefile-hacking, at least, and it possibly gives > callers more flexibility about what they actually want to do with the > string. Okay, I tried that. Since the script is turning one header into another, I borrowed the "*_d.h" nomenclature from the catalogs. Using a single file required some #ifdef hacks in the output file. Maybe there's a cleaner way to do this, but I don't know what it is. Using a single file also gave me another idea: Take value and category out of ScanKeyword, and replace them with an index into another array containing those, which will only be accessed in the event of a hit. That would shrink ScanKeyword to 4 bytes (offset, index), further increasing locality of reference. Might not be worth it, but I can try it after moving on to the core scanner. > I'm for "not change things unnecessarily". People might well be > scraping the keyword list out of parser/kwlist.h for other purposes > right now --- indeed, it's defined the way it is exactly to let > people do that. I don't see a good reason to force them to redo > whatever tooling they have that depends on that. So let's build > kwlist_offsets.h alongside that, but not change kwlist.h itself. Done. >> I used the global .gitignore, but maybe that's an abuse of it. > > Yeah, I'd say it is. Moved. >> +# TODO: Error out if the keyword names are not in ASCII order. 
> > +many for including such a check. Done. > Also note that we don't require people to have Perl installed when > building from a tarball. Therefore, these derived headers must get > built during "make distprep" and removed by maintainer-clean but > not distclean. I think this also has some implications for VPATH > builds, but as long as you follow the pattern used for other > derived header files (e.g. fmgroids.h), you should be fine. Done. I also blindly added support for MSVC. -John Naylor src/common/keywords.c | 55 src/include/common/keywords.h | 14 +++ src/pl/plpgsql/src/.gitignore | 1 + src/pl/plpgsql/src/Makefile | 9 +- src/pl/plpgsql/src/pl_scanner.c | 119 ++ src/pl/plpgsql/src/pl_unreserved_kwlist.h | 107 +++ src/tools/gen_keywords.pl | 136 ++ src/tools/msvc/Solution.pm| 10 +++ 8 files changed, 356 insertions(+), 95 deletions(-) diff --git a/src/common/keywords.c b/src/common/keywords.c index 0c0c794c68..0fb14a0372 100644 --- a/src/common/keywords.c +++ b/src/common/keywords.c @@ -112,3 +112,58 @@ ScanKeywordLookup(const char *text, return NULL; } + +/* Like ScanKeywordLookup, but uses offsets into a keyword string. */ +const ScanKeywordOffset * +ScanKeywordLookupOffset(const char *text, + const ScanKeywordOffset *keywords, + int num_keywords, + const char *keyword_string) +{ + int len, +i; + char word[NAMEDATALEN]; + const ScanKeywordOffset *low; + const ScanKeywordOffset *high; + + len = strlen(text); + /* We assume all keywords are shorter than NAMEDATALEN. */ + if (len >= NAMEDATALEN) + return NULL; + + /* + * Apply an ASCII-only downcasing. We must not use tolower() since it may + * produce the wrong translation in some locales (eg, Turkish). + */ + for (i = 0; i < len; i++) + { + char ch = text[i]; + + if (ch >= 'A' && ch <= 'Z') + ch += 'a' - 'A'; + word[i] = ch; + } + word[len] = '\0'; + + /* + * Now do a binary search using plain strcmp() comparison. 
+ */ + low = keywords; + high = keywords + (num_keywords - 1); + while (low <= high) + { + const ScanKeywordOffset *middle; + int difference; + + middle = low + (high - low) / 2; + difference = strcmp(keyword_string + middle->offset, word); + if (difference == 0) + return middle; + else if (difference < 0) + low = middle + 1; + else + high = middle - 1; + } + + return NULL; +} diff --git a/src/include/common/keywords.h b/src/include/common/keywords.h index 0b31505b66..7cd7bf4461 100644 --- a/src/include/common/keywords.h +++ b/src/include/common/keywords.h @@ -28,11 +28,20 @@ typedef struct ScanKeyword int16 category; /*
Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)
On 12/22/18, Tom Lane wrote: > John Naylor writes: >> Using a single file also gave me another idea: Take value and category >> out of ScanKeyword, and replace them with an index into another array >> containing those, which will only be accessed in the event of a hit. >> That would shrink ScanKeyword to 4 bytes (offset, index), further >> increasing locality of reference. Might not be worth it, but I can try >> it after moving on to the core scanner. > > I like that idea a *lot*, actually, because it offers the opportunity > to decouple this mechanism from all assumptions about what the > auxiliary data for a keyword is. Okay, in that case I went ahead and did it for WIP v3. > (it'd be a good idea to have a switch that allows specifying the > prefix of these constant names). Done as an optional switch, and tested, but not yet used in favor of the previous method as a fallback. I'll probably do it in the final version to keep lines below 80, and to add 'core_' to the core keyword vars. > /* Payload data for keywords */ > typedef struct MyKeyword > { > int16value; > int16category; > } MyKeyword; I tweaked this a bit to typedef struct ScanKeywordAux { int16 value; /* grammar's token code */ charcategory; /* see codes above */ } ScanKeywordAux; It seems that category was only 2 bytes to make ScanKeyword a power of 2 (of course that was on 32 bit machines and doesn't hold true anymore). Using char will save another few hundred bytes in the core scanner. Since we're only accessing this once per identifier, we may not need to worry so much about memory alignment. > Aside from being arguably better from the locality-of-reference > standpoint, this gets us out of the weird ifdef'ing you've got in > the v2 patch. The kwlist_d.h headers can be very ordinary headers. Yeah, that's a nice (and for me unexpected) bonus. 
-John Naylor src/common/keywords.c | 55 src/include/common/keywords.h | 12 +++ src/pl/plpgsql/src/.gitignore | 1 + src/pl/plpgsql/src/Makefile | 13 +-- src/pl/plpgsql/src/pl_scanner.c | 127 +++ src/pl/plpgsql/src/pl_unreserved_kwlist.h | 107 +++ src/tools/gen_keywords.pl | 139 ++ src/tools/msvc/Solution.pm| 10 +++ 8 files changed, 361 insertions(+), 103 deletions(-) diff --git a/src/common/keywords.c b/src/common/keywords.c index 0c0c794c68..b0e5a721b6 100644 --- a/src/common/keywords.c +++ b/src/common/keywords.c @@ -112,3 +112,58 @@ ScanKeywordLookup(const char *text, return NULL; } + +/* Like ScanKeywordLookup, but uses offsets into a keyword string. */ +int +ScanKeywordLookupOffset(const char *string_to_lookup, + const char *kw_strings, + const uint16 *kw_offsets, + int num_keywords) +{ + int len, +i; + char word[NAMEDATALEN]; + const uint16 *low; + const uint16 *high; + + len = strlen(string_to_lookup); + /* We assume all keywords are shorter than NAMEDATALEN. */ + if (len >= NAMEDATALEN) + return -1; + + /* + * Apply an ASCII-only downcasing. We must not use tolower() since it may + * produce the wrong translation in some locales (eg, Turkish). + */ + for (i = 0; i < len; i++) + { + char ch = string_to_lookup[i]; + + if (ch >= 'A' && ch <= 'Z') + ch += 'a' - 'A'; + word[i] = ch; + } + word[len] = '\0'; + + /* + * Now do a binary search using plain strcmp() comparison. 
+ */ + low = kw_offsets; + high = kw_offsets + (num_keywords - 1); + while (low <= high) + { + const uint16 *middle; + int difference; + + middle = low + (high - low) / 2; + difference = strcmp(kw_strings + *middle, word); + if (difference == 0) + return middle - kw_offsets; + else if (difference < 0) + low = middle + 1; + else + high = middle - 1; + } + + return -1; +} diff --git a/src/include/common/keywords.h b/src/include/common/keywords.h index 0b31505b66..201d0fcc7a 100644 --- a/src/include/common/keywords.h +++ b/src/include/common/keywords.h @@ -28,6 +28,13 @@ typedef struct ScanKeyword int16 category; /* see codes above */ } ScanKeyword; +/* Payload data for keywords */ +typedef struct ScanKeywordAux +{ + int16 value; /* grammar's token code */ + char category; /* see codes above */ +} ScanKeywordAux; + #ifndef FRONTEND extern PGDLLIMPORT const ScanKeyword ScanKeywords[]; extern PGDLLIMPORT const int NumScanKeywords; @@ -41,4 +48,9 @@ extern const ScanKeyword *ScanKeywordLookup(const char *text, const ScanKeyword *keywords, int num_keywords); +int ScanKeywordLookupOffset(const char *string_to_lookup, + const char *kw_strings, + const uint16 *kw_offsets, + int num_keywords); + #endif /* KEYWORDS_H */ diff --git a/src/pl/plpgs
Re: Speeding up text_position_next with multibyte encodings
On 12/22/18, Heikki Linnakangas wrote: > On 14/12/2018 20:20, John Naylor wrote: > I'm afraid that script doesn't work as a performance test. The > position() function is immutable, so the result gets cached in the plan > cache. All you're measuring is the speed to get the constant from the > plan cache :-(. That makes perfect sense now. I should have been more skeptical about the small and medium sizes having similar times. :/ > I rewrote the same tests with a little C helper function, attached, to > fix that, and to eliminate any PL/pgSQL overhead. Thanks for that, I'll probably have occasion to do something like this for other tests. > You chose interesting characters for the UTF-8 test. The haystack is a > repeating pattern of byte sequence EC 99 84, and the needle is a > repeating pattern of EC 84 B1. In the 'long' test, the lengths in the > skip table are '2', '1' and '250'. But the search bounces between the > '2' and '1' cases, and never hits the byte that would allow it to skip > 250 bytes. Interesting case, I had not realized that that can happen. Me neither, that was unintentional. > But I don't think we need to put much weight on that, you could come up > with other scenarios where the current code has skip table collisions, too. Okay. > So overall, I think it's still a worthwhile tradeoff, given that that is > a worst case scenario. If you choose less pathological UTF-8 codepoints, > or there is no match or the match is closer to the beginning of the > string, the patch wins. On 12/23/18, Tomas Vondra wrote: > So, what is the expected speedup in a "good/average" case? Do we have > some reasonable real-world workload mixing these cases that could be > used as a realistic benchmark? I'll investigate some "better" cases. -John Naylor
Re: automatically assigning catalog toast oids
On 12/11/18, Andres Freund wrote: > I've attached a patch implementing that. I'm not particularly in love > with FirstGenbkiObjectId as the symbol, but I couldn't think of > something more descriptive. How about FirstAutoAssignedObjectId? -John Naylor
reducing the footprint of ScanKeyword (was Re: Large writable variables)
On 10/15/18, Tom Lane wrote: > Andres Freund writes: >> On 2018-10-15 16:36:26 -0400, Tom Lane wrote: >>> We could possibly fix these by changing the data structure so that >>> what's in a ScanKeywords entry is an offset into some giant string >>> constant somewhere. No idea how that would affect performance, but >>> I do notice that we could reduce the sizeof(ScanKeyword), which can't >>> hurt. > >> Yea, that might even help performancewise. Alternatively we could change >> ScanKeyword to store the keyword name inline, but that'd be a measurable >> size increase... > > Yeah. It also seems like doing it this way would improve locality of > access: the pieces of the giant string would presumably be in the same > order as the ScanKeywords entries, whereas with the current setup, > who knows where the compiler has put 'em or in what order. > > We'd need some tooling to generate the constants that way, though; > I can't see how to make it directly from kwlist.h. A few months ago I was looking into faster search algorithms for ScanKeywordLookup(), so this is interesting to me. While an optimal full replacement would be a lot of work, the above ideas are much less invasive and would still have some benefit. Unless anyone intends to work on this, I'd like to flesh out the offset-into-giant-string approach a bit further: Since there are several callers of the current approach that don't use the core keyword list, we'd have to keep the existing struct and lookup function, to keep the complexity manageable. Once we have an offset-based struct and function, it makes sense to use it for all searches of core keywords. This includes not only the core scanner, but also adt/ruleutils.c, fe_utils/string_utils.c, and ecpg/preproc/keywords.c. There would need to be a header with offsets replacing name strings, generated from parser/kwlist.h, maybe kwlist_offset.h. It'd probably be convenient if it was emitted into the common/ dir. 
The giant string would likely need its own header (kwlist_string.h?). Since PL/pgSQL uses the core scanner, we'd need to use offsets in its reserved_keywords[], too. Those don't change much, so we can probably get away with hard-coding the offsets and the giant string in that case. (If that's not acceptable, we could separate that out to pl_reserved_kwlist.h and reuse the above tooling to generate pl_reserved_kwlist_{offset,string}.h, but that's more complex.) The rest should be just a SMOP. Any issues I left out? -John Naylor
Re: inconsistency and inefficiency in setup_conversion()
On 12/1/18, Dmitry Dolgov <9erthali...@gmail.com> wrote: > I see that the author keeps patch updated, but I'm a bit worried because of > lack of full review since probably May. I'm moving it to the next CF, let's > see > if there would be more feedback. > > P.S. adding Daniel, since he is assigned as a reviewer. Having heard nothing in a while, I've removed Daniel as a reviewer to make room for someone else. He is, of course, free to re-add himself. v8 is attached. Since it's been a few months since last discussion, I'd like to summarize the purpose of this patch and advocate for its inclusion in v12: 1. Correctness In the intro thread [1], I showed that object comments on some conversions are wrong, and hard to fix given the current setup. This is a documentation bug of sorts. 2. Clean-up Currently, utils/mb/conversion_procs/Makefile has an ad-hoc script to generate the SQL file, which has to be duplicated in the MSVC tooling, and executed by initdb.c. Storing the conversions in .dat files removes the need for any of that. 3. Performance This patch shaves 5-6% off of initdb. Not as much as hoped, but still a nice bonus. [1] https://www.postgresql.org/message-id/CAJVSVGWtUqxpfAaxS88vEGvi%2BjKzWZb2EStu5io-UPc4p9rSJg%40mail.gmail.com --John Naylor From bdbdb36129708d1d417f8db9009b377c3d4e3e90 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Fri, 14 Dec 2018 15:19:54 -0500 Subject: [PATCH v8 1/2] Add pg_language lookup. This didn't seem worth doing before, but an upcoming commit will add 88 entries to pg_proc whose language is 'c'. In addition, we can now use 'internal' in Gen_fmgr.pl instead of looking up the OID. This simplifies that script and its Makefile a bit. 
--- doc/src/sgml/bki.sgml| 6 +-- src/backend/catalog/genbki.pl| 8 +++ src/backend/utils/Gen_fmgrtab.pl | 7 ++- src/backend/utils/Makefile | 8 +-- src/include/catalog/pg_proc.dat | 92 src/include/catalog/pg_proc.h| 2 +- src/tools/msvc/Solution.pm | 4 +- 7 files changed, 64 insertions(+), 63 deletions(-) diff --git a/doc/src/sgml/bki.sgml b/doc/src/sgml/bki.sgml index 786abb95d4..37bb7a2346 100644 --- a/doc/src/sgml/bki.sgml +++ b/doc/src/sgml/bki.sgml @@ -409,8 +409,8 @@ that's error-prone and hard to understand, so for frequently-referenced catalogs, genbki.pl provides mechanisms to write symbolic references instead. Currently this is possible for references -to access methods, functions, operators, opclasses, opfamilies, and -types. The rules are as follows: +to access methods, functions, languages, operators, opclasses, opfamilies, +and types. The rules are as follows: @@ -420,7 +420,7 @@ Use of symbolic references is enabled in a particular catalog column by attaching BKI_LOOKUP(lookuprule) to the column's definition, where lookuprule - is pg_am, pg_proc, + is pg_am, pg_proc, pg_language, pg_operator, pg_opclass, pg_opfamily, or pg_type. BKI_LOOKUP can be attached to columns of diff --git a/src/backend/catalog/genbki.pl b/src/backend/catalog/genbki.pl index 8e2a2480be..8ccfcb7724 100644 --- a/src/backend/catalog/genbki.pl +++ b/src/backend/catalog/genbki.pl @@ -182,6 +182,13 @@ foreach my $row (@{ $catalog_data{pg_am} }) $amoids{ $row->{amname} } = $row->{oid}; } +# language OID lookup +my %langoids; +foreach my $row (@{ $catalog_data{pg_language} }) +{ + $langoids{ $row->{lanname} } = $row->{oid}; +} + # opclass OID lookup my %opcoids; foreach my $row (@{ $catalog_data{pg_opclass} }) @@ -256,6 +263,7 @@ foreach my $row (@{ $catalog_data{pg_type} }) # Map catalog name to OID lookup. 
my %lookup_kind = ( pg_am => \%amoids, + pg_language => \%langoids, pg_opclass => \%opcoids, pg_operator => \%operoids, pg_opfamily => \%opfoids, diff --git a/src/backend/utils/Gen_fmgrtab.pl b/src/backend/utils/Gen_fmgrtab.pl index ed16737a6a..b6809c7277 100644 --- a/src/backend/utils/Gen_fmgrtab.pl +++ b/src/backend/utils/Gen_fmgrtab.pl @@ -59,6 +59,8 @@ die "No include path; you must specify -I.\n" if !$include_path; # Note: We pass data file names as arguments and then look for matching # headers to parse the schema from. This is backwards from genbki.pl, # but the Makefile dependencies look more sensible this way. +# We currently only need pg_proc, but retain the possibility of reading +# more than one data file. my %catalogs; my %catalog_data; foreach my $datfile (@input_files) @@ -82,9 +84,6 @@ foreach my $datfile (@input_files) my $FirstGenbkiObjectId = Catalog::FindDefinedSymbol('access/transam.h', $include_path, 'FirstGenbkiObjectId'); -my $INTERNALlanguageId = - Catalog::FindDefinedSymbolFromData($catalog_data{pg_language}, - 'INTERNALlanguageId'); # Collect certain fields from pg_proc.dat. my @fmgr = (); @@ -94,7 +93,7 @@ foreach my $row (@{ $catalog_data{pg_proc
Re: Speeding up text_position_next with multibyte encodings
On 12/14/18, John Naylor wrote: > I signed up to be a reviewer, and I will be busy next month, so I went > ahead and fixed the typo in the patch that broke assert-enabled > builds. While at it, I standardized on the spelling "start_ptr" in a > few places to match the rest of the file. It's a bit concerning that > it wouldn't compile with asserts, but the patch was written by a > committer and seems to work. I just noticed that the contrib/citext test fails. I've set the status to waiting on author. -John Naylor
Re: automatically assigning catalog toast oids
> On 12/13/18, Tom Lane wrote: > Looking at the existing entries, it seems like we'd have to have > one special case: schema public has OID 2200 but is intentionally > not pinned. setup_depend() mentions other exceptions, but only this one currently has any effect as far as I can see: "pg_database: it's a feature, not a bug, that template1 is not pinned." -John Naylor
Re: Speeding up text_position_next with multibyte encodings
On 11/30/18, Dmitry Dolgov <9erthali...@gmail.com> wrote: > Unfortunately, patch doesn't compile anymore due: > > varlena.c: In function ‘text_position_next_internal’: > varlena.c:1337:13: error: ‘start_ptr’ undeclared (first use in this > function) > Assert(start_ptr >= haystack && start_ptr <= haystack_end); > > Could you please send an updated version? For now I'm moving it to the next > CF. I signed up to be a reviewer, and I will be busy next month, so I went ahead and fixed the typo in the patch that broke assert-enabled builds. While at it, I standardized on the spelling "start_ptr" in a few places to match the rest of the file. It's a bit concerning that it wouldn't compile with asserts, but the patch was written by a committer and seems to work. On 10/19/18, Heikki Linnakangas wrote: > This is a win in most cases. One case is slower: calling position() with > a large haystack input, where the match is near the end of the string. I wanted to test this case in detail, so I ran the attached script, which runs position() in three scenarios: short: 1 character needle at the end of a ~1000 character haystack, repeated 1 million times medium: 50 character needle at the end of a ~1 million character haystack, repeated 1 million times long: 250 character needle at the end of an ~80 million character haystack (~230MB, comfortably below the 256MB limit Heikki reported), done just one time I took the average of 5 runs using Korean characters in both UTF-8 (3-byte) and EUC-KR (2-byte) encodings. UTF-8 length master patch short 2.26s 2.51s med 2.28s 2.54s long3.07s 3.11s EUC-KR length master patch short 2.29s 2.37s med 2.29s 2.36s long1.75s 1.71s With UTF-8, the patch is 11-12% slower on short and medium strings, and about the same on long strings. With EUC-KR, the patch is about 3% slower on short and medium strings, and 2-3% faster on long strings. It seems the worst case is not that bad, and could be optimized, as Heikki said. 
-John Naylor From ca7a228174acb8b85b87642bb6df182139587c05 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Sun, 18 Nov 2018 17:40:09 +0700 Subject: [PATCH v2] Use single-byte Boyer-Moore-Horspool search even with multibyte encodings. The old implementation first converted the input strings to arrays of wchars, and performed the search on those. However, the conversion is expensive, and for a large input string, consumes a lot of memory. Avoid the conversion, and instead use the single-byte algorithm even with multibyte encodings. That has a couple of problems, though. Firstly, we might get fooled if the needle string's byte sequence appears embedded inside a different string. We might incorrectly return a match in the middle of a multi-byte character. The second problem is that the text_position function needs to return the position of the match, counted in characters, rather than bytes. We can work around both of those problems by an extra step after finding a match. Walk the string one character at a time, starting from the beginning, until we reach the position of the match. We can then check if the match was found at a valid character boundary, which solves the first problem, and we can count the characters, so that we can return the character offset. Walking the characters is faster than the wchar conversion, especially in the case that there are no matches and we can avoid it altogether. Also, avoiding the large allocation allows these functions to work with inputs larger than 256 MB, even with multibyte encodings. For UTF-8, we can even skip the step to verify that the match lands on a character boundary, because in UTF-8, the byte sequence of one character cannot appear in the middle of a different character. I think many other encodings have the same property, but I wasn't sure, so I only enabled that optimization for UTF-8. 
Most of the callers of the text_position_setup/next functions were actually not interested in the position of the match, counted in characters. To cater for them, refactor the text_position_next() interface into two parts: searching for the next match (text_position_next()), and returning the current match's position as a pointer (text_position_get_match_ptr()) or as a character offset (text_position_get_match_pos()). Most callers of text_position_setup/next are not interested in the character offsets, so getting the pointer to the match within the original string is a more convenient API for them. It also allows skipping the character counting with UTF-8 encoding altogether. --- src/backend/utils/adt/varlena.c | 475 +--- 1 file changed, 253 insertions(+), 222 deletions(-) diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 0fd3b15748..90
Re: WIP: Avoid creation of the free space map for small tables
On 11/29/18, Amit Kapila wrote: > On Thu, Nov 29, 2018 at 3:07 PM John Naylor wrote: >> Done. I tried adding it to several schedules, but for some reason >> vacuuming an empty table failed to truncate the heap to 0 blocks. >> Putting the test in its own group fixed the problem, but that doesn't >> seem ideal. >> > > It might be because it fails the should_attempt_truncation() check. > See below code: > > if (should_attempt_truncation(vacrelstats)) > lazy_truncate_heap(onerel, vacrelstats, vac_strategy); I see. I think truncating the FSM is not essential to show either the old or new behavior -- I could skip that portion to enable running the test in a parallel group. >> Can you please repeat the copy test you have done above with >> fillfactor as 20 and 30? > > I will send the results in a separate email soon. I ran the attached scripts which populates 100 tables with either 4 or 8 blocks. The test tables were pre-filled with one tuple and vacuumed so that the FSMs were already created when testing the master branch. The patch branch was compiled with a threshold of 8, but testing inserts of 4 pages effectively simulates a threshold of 4. Config was stock, except for fsync = off. I took the average of 40 runs (2 complete tests of 20 runs each) after removing the 10% highest and lowest: fillfactor=20 # blocksmaster patch 4 19.1ms 17.5ms 8 33.4ms 30.9ms fillfactor=30 # blocksmaster patch 4 20.1ms 19.7ms 8 34.7ms 34.9ms It seems the patch might be a bit faster with fillfactor=20, but I'm at a loss as to why that would be. Previous testing with a higher threshold showed a significant performance penalty starting around 10 blocks [1], but that used truncation rather than deletion, and had a fill-factor of 10. -- [1] https://www.postgresql.org/message-id/CAJVSVGWCRMyi8sSqguf6PfFcpM3hwNY5YhPZTt-8Q3ZGv0UGYw%40mail.gmail.com -John Naylor fsm-copy-setup.sql Description: application/sql fsm-copy-test-v3.sql Description: application/sql
Re: WIP: Avoid creation of the free space map for small tables
On 12/6/18, Amit Kapila wrote: > On Thu, Dec 6, 2018 at 10:53 PM John Naylor wrote: >> >> I've added an additional regression test for finding the right block >> and removed a test I thought was redundant. I've kept the test file in >> its own schedule. >> > > +# -- > +# fsm does a vacuum, and running it in parallel seems to prevent heap > truncation. > +# -- > +test: fsm > + > > It is not clear to me from the comment why running it in parallel > prevents heap truncation, can you explain what behavior are you seeing > and what makes you think that running it in parallel caused it? One of the tests deletes all records from the relation and vacuums. In serial schedule, the heap and FSM are truncated; in parallel they are not. Make check fails, since the tests measure relation size. Taking a closer look, I'm even more alarmed to discover that vacuum doesn't even seem to remove deleted rows in parallel schedule (that was in the last test I added), which makes no sense and causes that test to fail. I looked in vacuum.sql for possible clues, but didn't see any. I'll have to investigate further. -John Naylor
Re: Thinking about EXPLAIN ALTER TABLE
On 12/7/18, Greg Stark wrote: > I've been poking around with a feature I've wanted a number of times > in the past, "EXPLAIN ALTER TABLE". I believe I've seen your messages to that effect in the archives, so I've had it in the back of my mind as well. I think it could be very useful. > 3. Whether a full table constraint validation is going to happen Is it possible that this can occur via index-only scan and that this feature could know that? > For the most part ALTER TABLE is already structured such that this is > pretty easy. It does a lot of preparatory work without doing catalog > updates and I can just call that same preparatory work without calling > the subsequent work phases. One thing that came to mind: Since the input is not a plan tree, it seems it would be more difficult to keep EXPLAIN in sync with additional ALTER TABLE features. Do you have any thoughts about that? > postgres***=# explain alter table t alter column i set not null; > ┌─┐ > │ QUERY PLAN│ > ├─┤ > │ Lock Level: AccessExclusiveLock │ > │ ALTER TABLE: t │ > │ Relation: t │ > │ Rewrite: none │ > │ Relation: t2 │ > │ Rewrite: none │ > └─┘ In this example, I assume the two relations are partitions? With many partitions, this could get unwieldy pretty fast. I imagine there could be a shorthand notation. -John Naylor
docs: outdated reference to recursive expression evaluation
In config.sgml, in the section about max_stack_depth, there's this sentence: "The safety margin is needed because the stack depth is not checked in every routine in the server, but only in key potentially-recursive routines such as expression evaluation." Since the change in expression evaluation in v10, there's probably a better example of recursive routines, but I'm not sure what that would be. -John Naylor
Re: WIP: Avoid creation of the free space map for small tables
On 12/8/18, Amit Kapila wrote: > On Fri, Dec 7, 2018 at 7:25 PM John Naylor wrote: > I couldn't resist the temptation to figure out what's going on here. > The newly added tests have deletes followed by vacuum and then you > check whether the vacuum has removed the data by checking heap and or > FSM size. Now, when you run such a test in parallel, the vacuum can > sometimes skip removing the rows because there are parallel > transactions open which can see the deleted rows. Ah yes, of course. > You can easily > verify this phenomenon by running the newly added tests in one session > in psql when there is another parallel session which has an open > transaction. For example: > > Session-1 > Begin; > Insert into foo values(1); > > Session-2 > \i fsm.sql > > Now, you should see the results similar to what you are seeing when > you ran the fsm test by adding it to one of the parallel group. Can > you test this at your end and confirm whether my analysis is correct > or not. Yes, I see the same behavior. > So, you can keep the test as you have in parallel_schedule, but > comment needs to be changed. Also, you need to add the new test in > serial_schedule. I have done both the changes in the attached patch, > kindly confirm if this looks correct to you. Looks good to me. I'll just note that the new line in the serial schedule has an extra space at the end. Thanks for looking into this. -John Naylor
Re: WIP: Avoid creation of the free space map for small tables
On 12/1/18, Amit Kapila wrote: > Can you check whether the number of pages after test are the same with > and without a patch in this setup? I did verify that the number of pages was as intended. -John Naylor
Re: WIP: Avoid creation of the free space map for small tables
On 12/3/18, Amit Kapila wrote: >> During pg_upgrade, skip transfer of FSMs if they wouldn't have been >> created on the new cluster. > > I think in some cases, it won't be consistent with HEAD's behavior. > After truncate, we leave the FSM as it is, so the case where before > upgrade the relation was truncated, we won't create the FSM in new > cluster and that will be inconsistent with the behavior of HEAD. To be precise, with the TRUNCATE statement, the FSM (everything but the main relation fork, I think) is deleted, but using DELETE to remove all rows from the table will preserve the forks. In the latter case, when VACUUM truncates the FSM, it removes all leaf pages leaving behind the root page and one mid-level page. I haven't changed this behavior. > I think similar anomaly will be there when we delete rows from the table > such that after deletion size of relation becomes smaller than > HEAP_FSM_CREATION_THRESHOLD. Yes, in that case there will be inconsistency, but I'm comfortable with it. Others may not be. > I am not sure if it is a good idea to *not* transfer FSM files during > upgrade unless we ensure that we remove FSM whenever the relation size > falls below HEAP_FSM_CREATION_THRESHOLD. What do you think? BTW, > what is your reasoning for not removing FSM on truncate? My reasoning is that if we ever went past the threshold, it's likely we'll do so again, so I didn't feel it was worth the extra code complexity to remove the FSM. In the pg_upgrade case, however, it is simple to not copy the FSM. -John Naylor
Re: WIP: Avoid creation of the free space map for small tables
On 12/3/18, Amit Kapila wrote: > On Mon, Dec 3, 2018 at 9:46 AM Amit Kapila wrote: >> >> On Thu, Nov 29, 2018 at 3:07 PM John Naylor wrote: >> > >> > v8 code: > +fsm_local_set(Relation rel, BlockNumber new_nblocks) > +{ > + BlockNumber blkno, > + cached_target_block; > + > + /* > + * Mark blocks available starting after the last block number we have > + * cached, and ending at the current last block in the relation. > + * When we first set the map, this will flag all blocks as available > + * to try. If we reset the map while waiting for a relation > + * extension lock, this will only flag new blocks as available, > + * if any were created by another backend. > + */ > + for (blkno = fsm_local_map.nblocks; blkno < new_nblocks; blkno++) > + fsm_local_map.map[blkno] = FSM_LOCAL_AVAIL; > > v9 code: > +static void > +fsm_local_set(Relation rel, BlockNumber nblocks) > +{ > + BlockNumber blkno, > + cached_target_block; > + > + for (blkno = 0; blkno < nblocks; blkno++) > + fsm_local_map.map[blkno] = FSM_LOCAL_AVAIL; > > What is the reason for the above code change in the latest patch version? Per your recent comment, we no longer check relation size if we waited on a relation extension lock, so this is essentially a reversion to an earlier version. Keeping v8 would have the advantage that it'd be simple to change our minds about this. Do you have an opinion about that? > It would be good if you add few comments atop functions > GetPageWithFreeSpace, RecordAndGetPageWithFreeSpace and > RecordPageWithFreeSpace about their interaction with local map. Good idea, will do. -John Naylor
Re: automatically assigning catalog toast oids
On 12/9/18, Tom Lane wrote: > Another thing I seriously dislike is that this allows people to omit OIDs > from .dat entries in catalogs where we traditionally hand-assign OIDs. > That's not a good idea because it would mean those entries don't have > stable OIDs, whereas the whole point of hand assignment is to ensure > all built-in objects of a particular type have stable OIDs. Now, you > could argue about the usefulness of that policy for any given catalog; > but if we decide that catalog X doesn't need stable OIDs then that should > be an intentional policy change, not something that can happen because > one lazy hacker didn't follow the policy. On this point, I believe this could have happened anyway. pg_opclass has a mix of hand- and initdb-assigned oids, and there was nothing previously to stop that from creeping into any other catalog, as far as I can tell. -John Naylor
Re: docs: outdated reference to recursive expression evaluation
On 12/8/18, Tom Lane wrote: > John Naylor writes: >> In confg.sgml, in the section about max_stack_depth, there's this >> sentence: >> "The safety margin is needed because the stack depth is not checked in >> every routine in the server, but only in key potentially-recursive >> routines such as expression evaluation." > >> Since the change in expression evaluation in v10, there's probably a >> better example of recursive routines, but I'm not sure what that would >> be. > > We could say "expression compilation" and it'd still be valid. Or just > drop the last four words altogether. I don't think we want to expend the > verbiage to be more precise here, since it's only a tangential point. I'm inclined to agree. If you like, here's a patch to leave out the example. -John Naylor diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index fc4f17be41..4a7121a51f 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1675,12 +1675,11 @@ include_dir 'conf.d' enforced by the kernel (as set by ulimit -s or local equivalent), less a safety margin of a megabyte or so. The safety margin is needed because the stack depth is not checked in every -routine in the server, but only in key potentially-recursive routines -such as expression evaluation. The default setting is two -megabytes (2MB), which is conservatively small and -unlikely to risk crashes. However, it might be too small to allow -execution of complex functions. Only superusers can change this -setting. +routine in the server, but only in key potentially-recursive routines. +The default setting is two megabytes (2MB), which +is conservatively small and unlikely to risk crashes. However, +it might be too small to allow execution of complex functions. +Only superusers can change this setting.
Re: WIP: Avoid creation of the free space map for small tables
> Is the point 3 change related to pgindent? I think even if you want > these, then don't prepare other patches on top of this, keep it > entirely separate. Both removed. >> Also, we don't quite have a consensus on the threshold value, but I >> have set it to 4 pages for v8. If this is still considered too >> expensive (and basic tests show it shouldn't be), I suspect it'd be >> better to interleave the available block numbers as described a couple >> days ago than lower the threshold further. >> > > Can you please repeat the copy test you have done above with > fillfactor as 20 and 30? I will send the results in a separate email soon. > Few more comments: > --- > 1. I think we can add some test(s) to test the new functionality, may > be something on the lines of what Robert has originally provided as an > example of this behavior [1]. Done. I tried adding it to several schedules, but for some reason vacuuming an empty table failed to truncate the heap to 0 blocks. Putting the test in its own group fixed the problem, but that doesn't seem ideal. > 2. > The similar call is required in AbortSubTransaction function as well. > I suggest to add it after pgstat_progress_end_command in both > functions. Done. > 3. >> agree we shouldn't change that without a reason to. One simple idea is >> add a 3rd boolean parameter to GetPageWithFreeSpace() to control >> whether it gives up if the FSM fork doesn't indicate free space, like > I have the exact fix in my mind, so let's do it that way. Done. This also reverts comments and variable names that referred to updating the local map after relation extension. While at it, I changed a couple conditionals to check the locally cached nblocks rather than the threshold. No functional change, but looks more precise. Might save a few cycles as well. >> > 5. Your logic to update FSM on standby seems okay, but can you show >> > some tests which proves its sanity? 
>> >> I believe to convince myself it was working, I used the individual >> commands in the sql file in [3], then used the size function on the >> secondary. I'll redo that to verify. I've verified the standby behaves precisely as the primary, as far as the aforementioned script goes. -John Naylor From c4bef3bf790745638d42e5eb6b303b85a9a828c4 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Thu, 29 Nov 2018 16:31:41 +0700 Subject: [PATCH v9 1/2] Avoid creation of the free space map for small tables. The FSM isn't created if the heap has 4 blocks or fewer. If the last known target block has insufficient space, try every block before extending the heap. If a heap with a FSM is truncated back to below the threshold, the FSM stays around and can be used as usual. --- contrib/pageinspect/expected/page.out | 77 +++ contrib/pageinspect/sql/page.sql | 33 +-- doc/src/sgml/storage.sgml | 13 +- src/backend/access/brin/brin.c| 2 +- src/backend/access/brin/brin_pageops.c| 10 +- src/backend/access/heap/hio.c | 47 +++-- src/backend/access/transam/xact.c | 14 ++ src/backend/commands/vacuumlazy.c | 17 +- src/backend/storage/freespace/README | 11 +- src/backend/storage/freespace/freespace.c | 233 +- src/backend/storage/freespace/indexfsm.c | 6 +- src/include/storage/freespace.h | 9 +- src/test/regress/expected/fsm.out | 55 + src/test/regress/parallel_schedule| 5 + src/test/regress/sql/fsm.sql | 37 15 files changed, 464 insertions(+), 105 deletions(-) create mode 100644 src/test/regress/expected/fsm.out create mode 100644 src/test/regress/sql/fsm.sql diff --git a/contrib/pageinspect/expected/page.out b/contrib/pageinspect/expected/page.out index 3fcd9fbe6d..83e5910453 100644 --- a/contrib/pageinspect/expected/page.out +++ b/contrib/pageinspect/expected/page.out @@ -1,48 +1,69 @@ CREATE EXTENSION pageinspect; -CREATE TABLE test1 (a int, b int); -INSERT INTO test1 VALUES (16777217, 131584); -VACUUM test1; -- set up FSM +CREATE TABLE test_rel_forks (a int); +-- Make sure there are 
enough blocks in the heap for the FSM to be created. +INSERT INTO test_rel_forks SELECT g from generate_series(1,1) g; +-- set up FSM and VM +VACUUM test_rel_forks; -- The page contents can vary, so just test that it can be read -- successfully, but don't keep the output. -SELECT octet_length(get_raw_page('test1', 'main', 0)) AS main_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'main', 0)) AS main_0; main_0 8192 (1 row) -SELECT octet_length(get_raw_page('test1', 'main', 1)) AS main_1; -ERROR: block number 1 is out of range for relation "test1" -SELECT octet_length(get_raw_page('test1', 'fsm', 0)) AS fsm_0; +SELECT octet_length(get_ra
Re: WIP: Avoid creation of the free space map for small tables
On 12/3/18, Amit Kapila wrote: > On Mon, Dec 3, 2018 at 11:15 AM John Naylor wrote: >> Per your recent comment, we no longer check relation size if we waited >> on a relation extension lock, so this is essentially a reversion to an >> earlier version. >> > > fsm_local_set is being called from RecordAndGetPageWithFreeSpace and > GetPageWithFreeSpace whereas the change we have discussed was specific > to GetPageWithFreeSpace, so not sure if we need any change in > fsm_local_set. Not needed, but I assumed wrongly you'd think it unclear otherwise. I've now restored the generality and updated the comments to be closer to v8. > It would be good if you add few comments atop functions > GetPageWithFreeSpace, RecordAndGetPageWithFreeSpace and > RecordPageWithFreeSpace about their interaction with local map. Done. Also additional minor comment editing. I've added an additional regression test for finding the right block and removed a test I thought was redundant. I've kept the test file in its own schedule. -John Naylor From 7494ff89be7ad4e07f277f3bcc6e866c5527ce45 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Thu, 6 Dec 2018 10:49:03 -0600 Subject: [PATCH v10 1/2] Avoid creation of the free space map for small tables. The FSM isn't created if the heap has 4 blocks or fewer. If the last known target block has insufficient space, try every block before extending the heap. If a heap with a FSM is truncated back to below the threshold, the FSM stays around and can be used as usual. 
--- contrib/pageinspect/expected/page.out | 77 +++ contrib/pageinspect/sql/page.sql | 33 +-- doc/src/sgml/storage.sgml | 13 +- src/backend/access/brin/brin.c| 2 +- src/backend/access/brin/brin_pageops.c| 10 +- src/backend/access/heap/hio.c | 47 ++-- src/backend/access/transam/xact.c | 14 ++ src/backend/commands/vacuumlazy.c | 17 +- src/backend/storage/freespace/README | 11 +- src/backend/storage/freespace/freespace.c | 251 +- src/backend/storage/freespace/indexfsm.c | 6 +- src/include/storage/freespace.h | 9 +- src/test/regress/expected/fsm.out | 62 ++ src/test/regress/parallel_schedule| 5 + src/test/regress/sql/fsm.sql | 46 15 files changed, 498 insertions(+), 105 deletions(-) create mode 100644 src/test/regress/expected/fsm.out create mode 100644 src/test/regress/sql/fsm.sql diff --git a/contrib/pageinspect/expected/page.out b/contrib/pageinspect/expected/page.out index 3fcd9fbe6d..83e5910453 100644 --- a/contrib/pageinspect/expected/page.out +++ b/contrib/pageinspect/expected/page.out @@ -1,48 +1,69 @@ CREATE EXTENSION pageinspect; -CREATE TABLE test1 (a int, b int); -INSERT INTO test1 VALUES (16777217, 131584); -VACUUM test1; -- set up FSM +CREATE TABLE test_rel_forks (a int); +-- Make sure there are enough blocks in the heap for the FSM to be created. +INSERT INTO test_rel_forks SELECT g from generate_series(1,1) g; +-- set up FSM and VM +VACUUM test_rel_forks; -- The page contents can vary, so just test that it can be read -- successfully, but don't keep the output. 
-SELECT octet_length(get_raw_page('test1', 'main', 0)) AS main_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'main', 0)) AS main_0; main_0 8192 (1 row) -SELECT octet_length(get_raw_page('test1', 'main', 1)) AS main_1; -ERROR: block number 1 is out of range for relation "test1" -SELECT octet_length(get_raw_page('test1', 'fsm', 0)) AS fsm_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'main', 100)) AS main_100; +ERROR: block number 100 is out of range for relation "test_rel_forks" +SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 0)) AS fsm_0; fsm_0 --- 8192 (1 row) -SELECT octet_length(get_raw_page('test1', 'fsm', 1)) AS fsm_1; - fsm_1 - 8192 -(1 row) - -SELECT octet_length(get_raw_page('test1', 'vm', 0)) AS vm_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 10)) AS fsm_10; +ERROR: block number 10 is out of range for relation "test_rel_forks" +SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 0)) AS vm_0; vm_0 -- 8192 (1 row) -SELECT octet_length(get_raw_page('test1', 'vm', 1)) AS vm_1; -ERROR: block number 1 is out of range for relation "test1" +SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 1)) AS vm_1; +ERROR: block number 1 is out of range for relation "test_rel_forks" SELECT octet_length(get_raw_page('xxx', 'main', 0)); ERROR: relation "xxx" does not exist -SELECT octet_length(get_raw_page('test1', 'xxx', 0)); +SELECT octet_length(get_raw_page('test_rel_forks', 'xxx', 0)); ERROR: invalid fork name HINT: Valid fork names are "main", "fsm", "vm", and "init". -SELECT get_r
Re: WIP: Avoid creation of the free space map for small tables
I wrote: > On 11/19/18, Amit Kapila wrote: > [ abortive states ] >> I think it might come from any other place between when you set it and >> before it got cleared (like any intermediate buffer and pin related >> API's). > > Okay, I will look into that. LockBuffer(), visibilitymap_pin(), and GetVisibilityMapPins() don't call errors at this level. I don't immediately see any additional good places from which to clear the local map. -John Naylor
Re: inconsistency and inefficiency in setup_conversion()
v7 is a rebase over recent catalog changes. -John Naylor From 52aff43dcc91733c1b941fed4582d9c0602004d0 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Sat, 13 Oct 2018 19:28:08 +0700 Subject: [PATCH v7 1/2] Add pg_language lookup. This didn't seem worth doing before, but an upcoming commit will add 88 entries to pg_proc whose languge is 'c'. In addition, we can now use 'internal' in Gen_fmgr.pl instead of looking up the OID. This simplifies that script and its Makefile a bit. --- doc/src/sgml/bki.sgml| 6 +-- src/backend/catalog/genbki.pl| 8 +++ src/backend/utils/Gen_fmgrtab.pl | 9 ++-- src/backend/utils/Makefile | 8 +-- src/include/catalog/pg_proc.dat | 92 src/include/catalog/pg_proc.h| 2 +- src/tools/msvc/Solution.pm | 4 +- 7 files changed, 65 insertions(+), 64 deletions(-) diff --git a/doc/src/sgml/bki.sgml b/doc/src/sgml/bki.sgml index 786abb95d4..37bb7a2346 100644 --- a/doc/src/sgml/bki.sgml +++ b/doc/src/sgml/bki.sgml @@ -409,8 +409,8 @@ that's error-prone and hard to understand, so for frequently-referenced catalogs, genbki.pl provides mechanisms to write symbolic references instead. Currently this is possible for references -to access methods, functions, operators, opclasses, opfamilies, and -types. The rules are as follows: +to access methods, functions, languages, operators, opclasses, opfamilies, +and types. The rules are as follows: @@ -420,7 +420,7 @@ Use of symbolic references is enabled in a particular catalog column by attaching BKI_LOOKUP(lookuprule) to the column's definition, where lookuprule - is pg_am, pg_proc, + is pg_am, pg_proc, pg_language, pg_operator, pg_opclass, pg_opfamily, or pg_type. 
BKI_LOOKUP can be attached to columns of diff --git a/src/backend/catalog/genbki.pl b/src/backend/catalog/genbki.pl index edc8ea9f53..3410816882 100644 --- a/src/backend/catalog/genbki.pl +++ b/src/backend/catalog/genbki.pl @@ -175,6 +175,13 @@ foreach my $row (@{ $catalog_data{pg_am} }) $amoids{ $row->{amname} } = $row->{oid}; } +# language OID lookup +my %langoids; +foreach my $row (@{ $catalog_data{pg_language} }) +{ + $langoids{ $row->{lanname} } = $row->{oid}; +} + # opclass OID lookup my %opcoids; foreach my $row (@{ $catalog_data{pg_opclass} }) @@ -249,6 +256,7 @@ foreach my $row (@{ $catalog_data{pg_type} }) # Map catalog name to OID lookup. my %lookup_kind = ( pg_am => \%amoids, + pg_language => \%langoids, pg_opclass => \%opcoids, pg_operator => \%operoids, pg_opfamily => \%opfoids, diff --git a/src/backend/utils/Gen_fmgrtab.pl b/src/backend/utils/Gen_fmgrtab.pl index ca28291355..87d250f318 100644 --- a/src/backend/utils/Gen_fmgrtab.pl +++ b/src/backend/utils/Gen_fmgrtab.pl @@ -59,6 +59,8 @@ die "No include path; you must specify -I.\n" if !$include_path; # Note: We pass data file names as arguments and then look for matching # headers to parse the schema from. This is backwards from genbki.pl, # but the Makefile dependencies look more sensible this way. +# We currently only need pg_proc, but retain the possibility of reading +# more than one data file. my %catalogs; my %catalog_data; foreach my $datfile (@input_files) @@ -78,13 +80,10 @@ foreach my $datfile (@input_files) $catalog_data{$catname} = Catalog::ParseData($datfile, $schema, 0); } -# Fetch some values for later. +# Fetch a value for later. my $FirstBootstrapObjectId = Catalog::FindDefinedSymbol('access/transam.h', $include_path, 'FirstBootstrapObjectId'); -my $INTERNALlanguageId = - Catalog::FindDefinedSymbolFromData($catalog_data{pg_language}, - 'INTERNALlanguageId'); # Collect certain fields from pg_proc.dat. 
my @fmgr = (); @@ -94,7 +93,7 @@ foreach my $row (@{ $catalog_data{pg_proc} }) my %bki_values = %$row; # Select out just the rows for internal-language procedures. - next if $bki_values{prolang} ne $INTERNALlanguageId; + next if $bki_values{prolang} ne 'internal'; push @fmgr, { diff --git a/src/backend/utils/Makefile b/src/backend/utils/Makefile index e797539d09..da40f6b6c4 100644 --- a/src/backend/utils/Makefile +++ b/src/backend/utils/Makefile @@ -31,15 +31,11 @@ generated-header-symlinks: $(top_builddir)/src/include/utils/header-stamp $(top_ $(SUBDIRS:%=%-recursive): fmgr-stamp errcodes.h -FMGR_DATA := $(addprefix $(top_srcdir)/src/include/catalog/,\ - pg_language.dat pg_proc.dat \ - ) - # fmgr-stamp records the last time we ran Gen_fmgrtab.pl. We don't rely on # the timestamps of the individual output files, because the Perl script # won't update them if they didn't change (to avoid unnecessary recompiles). -fmgr-stamp: Gen_fmgrtab.pl $(catalogdir)/Catalog.pm $(FMGR_DATA) $(top_srcdir)/src/include/access/transam.h - $(PERL) -I $(catalogdir) $< -I $(top_srcdir)/src/include/ $(FMGR_DATA) +fmgr-stamp: Gen_fmgrtab.pl $(catalogd
Re: WIP: Avoid creation of the free space map for small tables
On 11/24/18, Amit Kapila wrote: > On Fri, Nov 23, 2018 at 11:56 AM John Naylor wrote: >> On 11/16/18, Amit Kapila wrote: >> >> > >> > 3. >> > static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 >> > slot, >> > -uint8 newValue, uint8 minValue); >> > + uint8 newValue, uint8 minValue); >> > >> > This appears to be a spurious change. >> >> 2 and 3 are separated into 0001. >> > > Is the point 3 change related to pgindent? I think even if you want > these, then don't prepare other patches on top of this, keep it > entirely separate. There are some places in the codebase that have spaces after tabs for no apparent reason (not to line up function parameters or pointer declarations). pgindent hasn't been able to fix this. If you wish, you are of course free to apply 0001 separately at any time. >> Also, we don't quite have a consensus on the threshold value, but I >> have set it to 4 pages for v8. If this is still considered too >> expensive (and basic tests show it shouldn't be), I suspect it'd be >> better to interleave the available block numbers as described a couple >> days ago than lower the threshold further. >> > > Can you please repeat the copy test you have done above with > fillfactor as 20 and 30? I did two kinds of tests. The first had a fill-factor of 10 [1], the second had the default storage, but I prevented the backend from caching the target block [2], to fully exercise the free space code. Would you like me to repeat the first one with 20 and 30? And do you think it is useful enough to test the copying of 4 blocks and not smaller numbers? > Few more comments: > --- > 1. I think we can add some test(s) to test the new functionality, may > be something on the lines of what Robert has originally provided as an > example of this behavior [1]. Maybe the SQL script attached to [3] (which I probably based on Robert's report) can be cleaned up into a regression test. > 3. > GetPageWithFreeSpace(Relation rel, Size spaceNeeded) > { > .. 
> + if (target_block == InvalidBlockNumber && > + rel->rd_rel->relkind == RELKIND_RELATION) > + { > + nblocks = RelationGetNumberOfBlocks(rel); > + > + if (nblocks > HEAP_FSM_CREATION_THRESHOLD) > + { > + /* > + * If the FSM knows nothing of the rel, try the last page before > + * we give up and extend. This avoids one-tuple-per-page syndrome > + * during bootstrapping or in a recently-started system. > + */ > + target_block = nblocks - 1; > + } > .. > } > > Moving this check inside GetPageWithFreeSpace has one disadvantage, we > will always consider last block which can have some inadvertent > effects. Consider when this function gets called from > RelationGetBufferForTuple just before extension, it can consider to > check for the last block even though that is already being done in the > begining when GetPageWithFreeSpace was called. I am not completely > sure how much this is a case to worry because it will help to check > last block when the same is concurrently added and FSM is not updated > for same. I am slightly worried because the unpatched code doesn't > care for such case and we have no intention to change this behaviour. > What do you think? I see what you mean. If the other backend extended by 1 block, the intention is to keep it out of the FSM at first, and by extension, not visible in other ways. The comment implies that's debatable, but I agree we shouldn't change that without a reason to. One simple idea is add a 3rd boolean parameter to GetPageWithFreeSpace() to control whether it gives up if the FSM fork doesn't indicate free space, like if (target_block == InvalidBlockNumber && rel->rd_rel->relkind == RELKIND_RELATION && !check_fsm_only) { nblocks = RelationGetNumberOfBlocks(rel); > 4. You have mentioned above that "system catalogs created during > bootstrap still have a FSM if they have any data." and I can also see > this behavior, have you investigated this point further? Code reading didn't uncover the cause. 
I might have to step through with a debugger or something similar. I should find time for that next month. > 5. Your logic to update FSM on standby seems okay, but can you show > some tests which proves its sanity? I believe to convince myself it was working, I used the individual commands in the sql file in [3], then used the size function on the secondary. I'll redo that to verify. -- [1] https://www.postgresql.org/message-id/CAJVSVGWCRMyi8sSqguf6PfFcpM3hwNY5YhPZTt-8Q3ZGv0UGYw%40mail.gmail.com [2] https://www.postgresql.org/message-id/CAJVSVGWMXzsqYpPhO3Snz4n5y8Tq-QiviuSCKyB5czCTnq9rzA%40mail.gmail.com [3] https://www.postgresql.org/message-id/CAJVSVGWvB13PzpbLEecFuGFc5V2fsO736BsdTakPiPAcdMM5tQ%40mail.gmail.com -John Naylor
Re: WIP: Avoid creation of the free space map for small tables
On 11/16/18, Amit Kapila wrote: I've attached v8, which includes the 2-state map and addresses the points below: > Some assorted comments: > 1. > > -Each heap and index relation, except for hash indexes, has a Free Space > Map > +Each heap relation, unless it is very small, and each index relation, > +except for hash indexes, has a Free Space Map > (FSM) to keep track of available space in the relation. It's stored > > It appears that line has ended abruptly. Revised. > 2. > page = BufferGetPage(buffer); > + targetBlock = BufferGetBlockNumber(buffer); > > if (!PageIsNew(page)) > elog(ERROR, "page %u of relation \"%s\" should be empty but is not", > - BufferGetBlockNumber(buffer), > + targetBlock, > RelationGetRelationName(relation)); > > PageInit(page, BufferGetPageSize(buffer), 0); > @@ -623,7 +641,18 @@ loop: > * current backend to make more insertions or not, which is probably a > * good bet most of the time. So for now, don't add it to FSM yet. > */ > - RelationSetTargetBlock(relation, BufferGetBlockNumber(buffer)); > + RelationSetTargetBlock(relation, targetBlock); > > Is this related to this patch? If not, I suggest let's do it > separately if required. > > 3. > static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot, > -uint8 newValue, uint8 minValue); > + uint8 newValue, uint8 minValue); > > This appears to be a spurious change. 2 and 3 are separated into 0001. > 4. > @@ -378,24 +386,15 @@ RelationGetBufferForTuple(Relation relation, Size > len, > * target. > */ > targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace); > + > + /* > + * In case we used an in-memory map of available blocks, reset > + * it for next use. > + */ > + if (targetBlock < HEAP_FSM_CREATION_THRESHOLD) > + ClearLocalMap(); > > How will you clear the local map during error? I think you need to > clear it in abort path and you can name the function as > FSMClearLocalMap or something like that. Done. I've put this call last before abort processing. > 5. 
> +/*#define TRACE_TARGETBLOCK */ > > Debugging leftover, do you want to retain this and related stuff > during the development of patch? I have kept it aside as a separate patch but not attached it for now. Also, we don't quite have a consensus on the threshold value, but I have set it to 4 pages for v8. If this is still considered too expensive (and basic tests show it shouldn't be), I suspect it'd be better to interleave the available block numbers as described a couple days ago than lower the threshold further. I have looked at zhio.c, and it seems trivial to adapt zheap to this patchset. -John Naylor From d1752858aaaf048fd94461dbd79fe0bb0f262b98 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Fri, 23 Nov 2018 12:55:16 +0700 Subject: [PATCH v8 1/3] Minor cosmetic adjustments for consistency. --- src/backend/access/heap/hio.c | 5 +++-- src/backend/storage/freespace/freespace.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c index b8b5871559..7f7c9db635 100644 --- a/src/backend/access/heap/hio.c +++ b/src/backend/access/heap/hio.c @@ -600,10 +600,11 @@ loop: * risk wiping out valid data). */ page = BufferGetPage(buffer); + targetBlock = BufferGetBlockNumber(buffer); if (!PageIsNew(page)) elog(ERROR, "page %u of relation \"%s\" should be empty but is not", - BufferGetBlockNumber(buffer), + targetBlock, RelationGetRelationName(relation)); PageInit(page, BufferGetPageSize(buffer), 0); @@ -623,7 +624,7 @@ loop: * current backend to make more insertions or not, which is probably a * good bet most of the time. So for now, don't add it to FSM yet. 
*/ - RelationSetTargetBlock(relation, BufferGetBlockNumber(buffer)); + RelationSetTargetBlock(relation, targetBlock); return buffer; } diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index 7c4ad1c449..47a991e21d 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -106,7 +106,7 @@ static Size fsm_space_cat_to_avail(uint8 cat); /* workhorse functions for various operations */ static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot, - uint8 newValue, uint8 minValue); +uint8 newValue, uint8 minValue); static BlockNumber fsm_search(Relation rel, uint8 min_cat); static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, BlockNumber start, BlockNumber end, -- 2.17.1 From e6226e8ee3d51e651b3a0b7035ad6676b4cf4bc6 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Fri, 23 Nov 2018 12:58:07 +0700 Subject: [PATCH v8 2/3] Avoid creation of the free space map for small tables
Re: Sync ECPG scanner with core
I wrote: > On 11/14/18, Tom Lane wrote: >> Alvaro Herrera writes: >>> Maybe time to compile the ecpg test cases during "make check-world"? >> >> I'm dubious that the lexer is a significant part of that, though I could >> be wrong ... > > If it were, it'd be much easier to try a Flex flag other than the > default, most compact representation, something like -Cfe. That'd be a > prerequisite for no-backtracking to matter anyway. That's easy enough > to test, so I'll volunteer to do that sometime. The preproc phase of make check in the ecpg dir only takes 150ms. Compiling and linking the test executables takes another 2s, and running the tests takes another 5s on my machine. Even without setting up and tearing down the temp instance, preproc is trivial. -John Naylor
Re: [RFC] Removing "magic" oids
On 11/20/18, Andres Freund wrote: > I'm > sure we'll find some things to adapt around the margins, but imo the > patch as a whole looks pretty reasonable. bki.sgml is now a bit out of date. I've attached a draft fix. -John Naylor diff --git a/doc/src/sgml/bki.sgml b/doc/src/sgml/bki.sgml index 0fb309a1bd..1919bac936 100644 --- a/doc/src/sgml/bki.sgml +++ b/doc/src/sgml/bki.sgml @@ -89,7 +89,7 @@ The CATALOG line can also be annotated, with some other BKI property macros described in genbki.h, to define other properties of the catalog as a whole, such as whether - it has OIDs (by default, it does). + it is a shared relation. @@ -354,7 +354,7 @@ also needed if the row's OID must be referenced from C code. If neither case applies, the oid metadata field can be omitted, in which case the bootstrap code assigns an OID -automatically, or leaves it zero in a catalog that has no OIDs. +automatically. In practice we usually preassign OIDs for all or none of the pre-loaded rows in a given catalog, even if only some of them are actually cross-referenced. @@ -387,15 +387,16 @@ through the catalog headers and .dat files to see which ones do not appear. You can also use the duplicate_oids script to check for mistakes. -(genbki.pl will also detect duplicate OIDs +(genbki.pl will assign OIDs for any rows that +didn't get one hand-assigned to them and also detect duplicate OIDs at compile time.) The OID counter starts at 1 at the beginning of a bootstrap run. -If a catalog row is in a table that requires OIDs, but no OID was -preassigned by an oid field, then it will -receive an OID of 1 or above. +If a row from a source other than postgres.bki +is inserted into a table that requires OIDs, then it will receive an +OID of 1 or above. 
@@ -714,7 +715,6 @@ $ perl rewrite_dat_with_prokind.pl pg_proc.dat tableoid bootstrap shared_relation - without_oids rowtype_oid oid (name1 = type1 @@ -766,7 +766,6 @@ $ perl rewrite_dat_with_prokind.pl pg_proc.dat The table is created as shared if shared_relation is specified. - It will have OIDs unless without_oids is specified. The table's row type OID (pg_type OID) can optionally be specified via the rowtype_oid clause; if not specified, an OID is automatically generated for it. (The rowtype_oid @@ -805,7 +804,7 @@ $ perl rewrite_dat_with_prokind.pl pg_proc.dat - insert OID = oid_value ( value1 value2 ... ) + insert OID = oid_value ( oid_value value1 value2 ... ) @@ -813,11 +812,10 @@ $ perl rewrite_dat_with_prokind.pl pg_proc.dat Insert a new row into the open table using value1, value2, etc., for its column - values and oid_value for its OID. If - oid_value is zero - (0) or the clause is omitted, and the table has OIDs, then the - next available OID is assigned. + values and oid_value + for its OID. Note that OID is stored in an ordinary column; the + OID = oid_value + statement is only there for debugging purposes. @@ -991,10 +989,10 @@ $ perl rewrite_dat_with_prokind.pl pg_proc.dat int4 and text, respectively, and insert two rows into the table: -create test_table 420 (cola = int4, colb = text) +create test_table 420 (oid = oid, cola = int4, colb = text) open test_table -insert OID=421 ( 1 "value1" ) -insert OID=422 ( 2 _null_ ) +insert OID=421 ( 421 1 "value1" ) +insert OID=422 ( 422 2 _null_ ) close test_table
Re: WIP: Avoid creation of the free space map for small tables
On 11/16/18, Amit Kapila wrote: > +/* Status codes for the local map. */ > +#define FSM_LOCAL_ZERO 0x00 /* Beyond the end of the relation */ > +#define FSM_LOCAL_AVAIL 0x01 /* Available to try */ > +#define FSM_LOCAL_TRIED 0x02 /* Already tried, not enough space */ > > Instead of maintaining three states, can't we do with two states > (Available and Not Available), basically combine 0 and 2 in your case. > I think it will save some cycles in > fsm_local_set, where each time you need to initialize all the entries > in the map. I think we can argue that it is not much overhead, but I > think it is better code-wise also if we can make it happen with fewer > states. That'd work too, but let's consider this scenario: We have a 2-block table that has no free space. After trying each block, the local cache looks like 0123 TT00 Let's say we have to wait to acquire a relation extension lock, because another backend had already started extending the heap by 1 block. We call GetPageWithFreeSpace() and now the local map looks like 0123 TTA0 By using bitwise OR to set availability, the already-tried blocks remain as they are. With only 2 states, the map would look like this instead: 0123 AAAN If we assume that an insert into the newly-created block 2 will almost always succeed, we don't have to worry about wasting time re-checking the first 2 full blocks. Does that sound right to you? > Some assorted comments: > 1. > > -Each heap and index relation, except for hash indexes, has a Free Space > Map > +Each heap relation, unless it is very small, and each index relation, > +except for hash indexes, has a Free Space Map > (FSM) to keep track of available space in the relation. It's stored > > It appears that line has ended abruptly. Not sure what you're referring to here. > 2. 
> page = BufferGetPage(buffer); > + targetBlock = BufferGetBlockNumber(buffer); > > if (!PageIsNew(page)) > elog(ERROR, "page %u of relation \"%s\" should be empty but is not", > - BufferGetBlockNumber(buffer), > + targetBlock, > RelationGetRelationName(relation)); > > PageInit(page, BufferGetPageSize(buffer), 0); > @@ -623,7 +641,18 @@ loop: > * current backend to make more insertions or not, which is probably a > * good bet most of the time. So for now, don't add it to FSM yet. > */ > - RelationSetTargetBlock(relation, BufferGetBlockNumber(buffer)); > + RelationSetTargetBlock(relation, targetBlock); > > Is this related to this patch? If not, I suggest let's do it > separately if required. I will separate this out. > 3. > static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot, > -uint8 newValue, uint8 minValue); > + uint8 newValue, uint8 minValue); > > This appears to be a spurious change. It was intentional, but I will include it separately as above. > 4. > @@ -378,24 +386,15 @@ RelationGetBufferForTuple(Relation relation, Size > len, > * target. > */ > targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace); > + > + /* > + * In case we used an in-memory map of available blocks, reset > + * it for next use. > + */ > + if (targetBlock < HEAP_FSM_CREATION_THRESHOLD) > + ClearLocalMap(); > > How will you clear the local map during error? I think you need to > clear it in abort path and you can name the function as > FSMClearLocalMap or something like that. That sounds right, and I will rename the function that way. For the abort path, were you referring to this or somewhere else? if (!PageIsNew(page)) elog(ERROR, "page %u of relation \"%s\" should be empty but is not", targetBlock, RelationGetRelationName(relation)); > 5. > +/*#define TRACE_TARGETBLOCK */ > > Debugging leftover, do you want to retain this and related stuff > during the development of patch? I modeled this after TRACE_VISIBILITYMAP in visibilitymap.c. 
It's useful for development, but I don't particularly care whether it's in the final version. Also, I found an off-by-one error that caused an unnecessary smgrexists() call in tables with threshold + 1 pages. This will be fixed in the next version. -John Naylor
Re: WIP: Avoid creation of the free space map for small tables
On 11/19/18, Amit Kapila wrote: > On Mon, Nov 19, 2018 at 7:30 AM John Naylor wrote: >> Let's say we have to wait to acquire a relation extension lock, >> because another backend had already started extending the heap by 1 >> block. We call GetPageWithFreeSpace() and now the local map looks like >> >> 0123 >> TTA0 >> >> By using bitwise OR to set availability, the already-tried blocks >> remain as they are. With only 2 states, the map would look like this >> instead: >> >> 0123 >> AAAN >> > In my mind for such a case it should look like below: > 0123 > NNAN Okay, to retain that behavior with only 2 status codes, I have implemented the map as a struct with 2 members: the cached number of blocks, plus the same array I had before. This also allows a more efficient implementation at the micro level. I just need to do some more testing on it. [ abortive states ] > I think it might come from any other place between when you set it and > before it got cleared (like any intermediate buffer and pin related > API's). Okay, I will look into that. > One other thing that slightly bothers me is the call to > RelationGetNumberOfBlocks via fsm_allow_writes. It seems that call > will happen quite frequently in this code-path and can have some > performance impact. As of now, I don't have any idea to avoid it or > reduce it more than what you already have in the patch, but I think we > should try some more to avoid it. Let me know if you have any ideas > around that? FWIW, I believe that the callers of RecordPageWithFreeSpace() will almost always avoid that call. Otherwise, there is at least one detail that could use attention: If rel->rd_rel->relpages shows fewer pages than the threshold, than the code doesn't trust it to be true. Might be worth revisiting. Aside from that, I will have to think about it. More generally, I have a couple ideas about performance: 1. Only mark available every other block such that visible blocks are interleaved as the relation extends. 
To explain, this diagram shows a relation extending, with 1 meaning marked available and 0 meaning marked not-available. A NA ANA NANA So for a 3-block table, we never check block 1. Any free space it has acquired will become visible when it extends to 4 blocks. For a 4-block threshold, we only check 2 blocks or less. This reduces the number of lock/pin events but still controls bloat. We could also check both blocks of a 2-block table. 2. During manual testing I seem to remember times that the FSM code was invoked even though I expected the smgr entry to have a cached target block. Perhaps VACUUM or something is clearing that away unnecessarily. It seems worthwhile to verify and investigate, but that seems like a separate project. -John Naylor
Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)
On Tue, Jan 8, 2019 at 3:04 PM Tom Lane wrote: > > I'll take a crack at separating into a module. I'll wait a bit in > > case there are any stylistic suggestions on the patch as it stands. > > I had a go at that myself. I'm sure there's plenty to criticize in > the result, but at least it passes make check-world ;-) Just a couple comments about the module: -If you qualify the function's module name as you did (PerfectHash::generate_hash_function), you don't have to export the function into the callers namespace, so you can skip the @EXPORT_OK setting. Most of our modules don't export. -There is a bit of a cognitive clash between $case_sensitive in gen_keywordlist.pl and $case_insensitive in PerfectHash.pm. They each make sense in their own file, but might it be worth using one or the other? -As for the graph algorithm, I'd have to play with it to understand how it works. In the committed keyword patch, I noticed that in common/keywords.c, the array length is defined with ScanKeywordCategories[SCANKEYWORDS_NUM_KEYWORDS] but other keyword arrays just have ...[]. Is there a reason for the difference?
Re: WIP: Avoid creation of the free space map for small tables
On Sun, Dec 30, 2018 at 10:50 PM Amit Kapila wrote: > > On Sun, Dec 30, 2018 at 3:49 AM John Naylor wrote: > > > I also tried to insert more > > > records till 8 pages and same regression is observed! So I guess even > > > HEAP_FSM_CREATION_THRESHOLD = 4 is not perfect! > > > > That's curious, because once the table exceeds the threshold, it would > > be allowed to update the FSM, and in the process write 3 pages that it > > didn't have to in the 4 page test. The master branch has the FSM > > already, so I would expect the 8 page case to regress more. > > > > It is not clear to me why you think there should be regression at 8 > pages when HEAP_FSM_CREATION_THRESHOLD is 4. Basically, once FSM > starts getting updated, we should be same as HEAD as it won't take any > special path? In this particular test, the FSM is already created ahead of time for the master branch, so we can compare accessing FSM versus checking every page. My reasoning is that passing the threshold would take some time to create 3 FSM pages with the patch, leading to a larger regression. It seems we don't observe this, however. On Sun, Dec 30, 2018 at 10:59 PM Mithun Cy wrote: > I have some minor comments for pg_upgrade patch > 1. Now we call stat main fork file in transfer_relfile() > +sret = stat(old_file, &statbuf); > > +/* Save the size of the first segment of the main fork. */ > +if (type_suffix[0] == '\0' && segno == 0) > +first_seg_size = statbuf.st_size; > > But we do not handle the case if stat has returned any error! How about this: /* Did file open fail? */ if (stat(old_file, &statbuf) != 0) { /* Extent, fsm, or vm does not exist? That's OK, just return */ if (errno == ENOENT && (type_suffix[0] != '\0' || segno != 0)) return first_seg_size; else pg_fatal("error while checking for file existence \"%s.%s\" (\"%s\" to \"%s\"): %s\n", map->nspname, map->relname, old_file, new_file, strerror(errno)); } /* Save the size of the first segment of the main fork. 
*/ else if (type_suffix[0] == '\0' && segno == 0) first_seg_size = statbuf.st_size; /* If extent, fsm, or vm is empty, just return */ else if (statbuf.st_size == 0) return first_seg_size; > 2. src/bin/pg_upgrade/pg_upgrade.h > > char *relname; > + > +charrelkind;/* relation relkind -- see pg_class.h */ > > I think we can remove the added empty line. In the full context: -/* the rest are used only for logging and error reporting */ + +/* These are used only for logging and error reporting. */ char *nspname;/* namespaces */ char *relname; + +charrelkind;/* relation relkind -- see pg_class.h */ Relkind is not used for logging or error reporting, so the space sets it apart from the previous members. I could instead put relkind before those other two... -John Naylor
Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)
On Wed, Jan 9, 2019 at 5:33 PM Tom Lane wrote: > really need here. We could go with "[no-]case-insensitive", perhaps. > Or "[no-]case-fold", which is at least a little shorter and less > double-negative-y. I'd be in favor of --[no-]case-fold. On Tue, Jan 8, 2019 at 5:53 PM Tom Lane wrote: > I improved the comment about how come the hash table entry assignment > works. I've gone over the algorithm in more detail and I don't see any nicer way to write it. This comment in PerfectHash.pm: (It does change '_', else we could just skip adjusting # $cn here at all, for typical keyword strings.) ...seems a bit out of place in the module, because of its reference to keywords, of interest right now to its only caller. Maybe a bit of context here. (I also don't quite understand why we could hypothetically skip the adjustment.) Lastly, the keyword headers still have a dire warning about ASCII order and binary search. Those could be softened to match the comment in gen_keywordlist.pl. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: WIP: Avoid creation of the free space map for small tables
On 1/3/19, Amit Kapila wrote: > Yeah, that makes sense, John, can you provide a patch on top of the > current patch where we check either the last block or every other > block. I've attached two patches for testing. Each one applies on top of the current patch. Mithun, I'll respond to your other review comments later this week. -John Naylor diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index 2ab1cb58b7..d6352aabd9 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -1103,15 +1103,9 @@ fsm_allow_writes(Relation rel, BlockNumber heapblk, static void fsm_local_set(Relation rel, BlockNumber curr_nblocks) { - BlockNumber blkno, -cached_target_block; + BlockNumber cached_target_block; - /* - * Mark blocks available starting after the last block we have mapped, - * and ending at the current last block in the relation. - */ - for (blkno = fsm_local_map.nblocks; blkno < curr_nblocks; blkno++) - fsm_local_map.map[blkno] = FSM_LOCAL_AVAIL; + fsm_local_map.map[curr_nblocks - 1] = FSM_LOCAL_AVAIL; /* Cache the number of blocks. */ fsm_local_map.nblocks = curr_nblocks; diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index 2ab1cb58b7..820a2579ae 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -1110,8 +1110,15 @@ fsm_local_set(Relation rel, BlockNumber curr_nblocks) * Mark blocks available starting after the last block we have mapped, * and ending at the current last block in the relation. */ - for (blkno = fsm_local_map.nblocks; blkno < curr_nblocks; blkno++) + blkno = curr_nblocks - 1; + for (;;) + { fsm_local_map.map[blkno] = FSM_LOCAL_AVAIL; + if (blkno >= fsm_local_map.nblocks + 2) + blkno -= 2; + else + break; + } /* Cache the number of blocks. 
*/ fsm_local_map.nblocks = curr_nblocks; diff --git a/src/test/regress/expected/fsm.out b/src/test/regress/expected/fsm.out index 4c44c4bd6e..973b9319e7 100644 --- a/src/test/regress/expected/fsm.out +++ b/src/test/regress/expected/fsm.out @@ -2,16 +2,16 @@ -- Free Space Map test -- CREATE TABLE fsm_check_size (num int, str text); --- Fill 2 blocks with as many large records as will fit +-- Fill 3 blocks with as many large records as will fit -- No FSM INSERT INTO fsm_check_size SELECT g, rpad('', 1024, 'a') -FROM generate_series(1,7*2) g; +FROM generate_series(1,7*3) g; VACUUM fsm_check_size; SELECT pg_relation_size('fsm_check_size', 'main') AS heap_size, pg_relation_size('fsm_check_size', 'fsm') AS fsm_size; heap_size | fsm_size ---+-- - 16384 |0 + 24576 |0 (1 row) -- Clear some space on block 0 @@ -26,7 +26,7 @@ SELECT pg_relation_size('fsm_check_size', 'main') AS heap_size, pg_relation_size('fsm_check_size', 'fsm') AS fsm_size; heap_size | fsm_size ---+-- - 16384 |0 + 24576 |0 (1 row) -- Extend table with enough blocks to exceed the FSM threshold diff --git a/src/test/regress/sql/fsm.sql b/src/test/regress/sql/fsm.sql index fad99da1c2..a864c7fd88 100644 --- a/src/test/regress/sql/fsm.sql +++ b/src/test/regress/sql/fsm.sql @@ -4,10 +4,10 @@ CREATE TABLE fsm_check_size (num int, str text); --- Fill 2 blocks with as many large records as will fit +-- Fill 3 blocks with as many large records as will fit -- No FSM INSERT INTO fsm_check_size SELECT g, rpad('', 1024, 'a') -FROM generate_series(1,7*2) g; +FROM generate_series(1,7*3) g; VACUUM fsm_check_size; SELECT pg_relation_size('fsm_check_size', 'main') AS heap_size, pg_relation_size('fsm_check_size', 'fsm') AS fsm_size; @@ -16,7 +16,7 @@ pg_relation_size('fsm_check_size', 'fsm') AS fsm_size; DELETE FROM fsm_check_size where num <= 5; VACUUM fsm_check_size; --- Insert small record in block 1 to set the cached smgr targetBlock +-- Insert small record in block 2 to set the cached smgr targetBlock INSERT INTO 
fsm_check_size values(99, 'b'); -- Insert large record and make sure it goes in block 0 rather than
Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)
On 1/6/19, Tom Lane wrote: > I've pushed that version (v8 + max_kw_len); if the buildfarm doesn't > fall over, we can move on with looking at hashing. Thank you. The API adjustment looks good, and I'm glad that splitting out the aux info led to even further cleanups. -John Naylor
Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)
On 12/30/18, Andres Freund wrote: > I tried to take this for a spin, and for me the build fails because various > frontend programs don't have KeywordOffsets/Strings defined, but reference > it > through various functions exposed to the frontend (like fmtId()). That I > see > that error but you don't is probably related to me using -fuse-ld=gold in > CFLAGS. > > I can "fix" this by including kwlist_d.h in common/keywords.c > regardless of FRONTEND. That also lead me to discover that the build > dependencies somewhere aren't correctly set-up, because I need to > force a clean rebuild to trigger the problem again, just changing > keywords.c back doesn't trigger the problem. Hmm, that was a typo, and I didn't notice even when I found I had to include kwlist_d.h in ecpg/keywords.c. :-( I've fixed both of those in the attached v6. As far as dependencies, I'm far from sure I have it up to par. That piece could use some discussion. On 1/4/19, Tom Lane wrote: > Aside from the possible linkage problem, this will need a minor rebase > over 4879a5172, which rearranged some of plpgsql's calls of > ScanKeywordLookup. > > While I don't think it's going to be hard to resolve these issues, > I'm wondering where we want to go with this. Is anyone excited > about pursuing the perfect-hash-function idea? (Joerg's example > function looked pretty neat to me.) If we are going to do that, > does it make sense to push this version beforehand? If it does, for v6 I've also done the rebase, updated the copyright year, and fixed an error in MSVC. -John Naylor From fb846eeabd450b6932e57f7e8d4f0aebc125d178 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Fri, 4 Jan 2019 14:35:40 -0500 Subject: [PATCH v6] Use offset-based keyword lookup. Replace binary search over an array of ScanKeyword structs with a binary search over an array of offsets into a keyword string. Access auxiliary data only after a keyword hit. This has better locality of reference and a smaller memory footprint. 
--- .../pg_stat_statements/pg_stat_statements.c | 2 + src/backend/parser/parser.c | 4 +- src/backend/parser/scan.l | 38 ++-- src/backend/utils/adt/misc.c | 2 +- src/backend/utils/adt/ruleutils.c | 9 +- src/common/.gitignore | 1 + src/common/Makefile | 25 ++- src/common/keywords.c | 42 +++-- src/fe_utils/string_utils.c | 9 +- src/include/common/keywords.h | 19 +- src/include/parser/kwlist.h | 2 +- src/include/parser/scanner.h | 8 +- src/interfaces/ecpg/preproc/.gitignore| 2 + src/interfaces/ecpg/preproc/Makefile | 20 ++- src/interfaces/ecpg/preproc/c_keywords.c | 65 ++- src/interfaces/ecpg/preproc/c_kwlist.h| 53 ++ src/interfaces/ecpg/preproc/ecpg_keywords.c | 81 ++--- src/interfaces/ecpg/preproc/ecpg_kwlist.h | 68 +++ src/interfaces/ecpg/preproc/keywords.c| 6 +- src/interfaces/ecpg/preproc/pgc.l | 22 +-- src/interfaces/ecpg/preproc/preproc_extern.h | 8 +- src/pl/plpgsql/src/.gitignore | 2 + src/pl/plpgsql/src/Makefile | 16 +- src/pl/plpgsql/src/pl_reserved_kwlist.h | 55 ++ src/pl/plpgsql/src/pl_scanner.c | 167 -- src/pl/plpgsql/src/pl_unreserved_kwlist.h | 111 src/tools/gen_keywords.pl | 136 ++ src/tools/msvc/Solution.pm| 44 + src/tools/msvc/clean.bat | 6 + 29 files changed, 696 insertions(+), 327 deletions(-) create mode 100644 src/common/.gitignore create mode 100644 src/interfaces/ecpg/preproc/c_kwlist.h create mode 100644 src/interfaces/ecpg/preproc/ecpg_kwlist.h create mode 100644 src/pl/plpgsql/src/pl_reserved_kwlist.h create mode 100644 src/pl/plpgsql/src/pl_unreserved_kwlist.h create mode 100644 src/tools/gen_keywords.pl diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index e8ef966bb5..4776de7595 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -3076,6 +3076,8 @@ fill_in_constant_lengths(pgssJumbleState *jstate, const char *query, yyscanner = scanner_init(query, , ScanKeywords, + KeywordString, + KeywordOffsets, 
NumScanKeywords); /* we don't want to re-emit any escape string warnings */ diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c index 7e9b1222fd..19fa7bb8ae 100644 --- a/src/backend/parser/parser.c +++ b/src/backend/parser/parser.c @@ -40,8 +40,8 @@ raw_parser(const char *str) int yyresult; /* initialize the flex scanner */ - yys
Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)
On 1/3/19, Joerg Sonnenberger wrote: > Hello John, > I was pointed at your patch on IRC and decided to look into adding my > own pieces. What I can provide you is a fast perfect hash function > generator. I've attached a sample hash function based on the current > main keyword list. hash() essentially gives you the number of the only > possible match, a final strcmp/memcmp is still necessary to verify that > it is an actual keyword though. The |0x20 can be dropped if all cases > have pre-lower-cased the input already. This would replace the binary > search in the lookup functions. Returning offsets directly would be easy > as well. That allows writing a single string where each entry is prefixed > with a type mask, the token id, the length of the keyword and the actual > keyword text. Does that sound useful to you? Judging by previous responses, there is still interest in using perfect hash functions, so thanks for this. I'm not knowledgeable enough to judge its implementation, so I'll leave that for others. -John Naylor
Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)
On 12/27/18, Tom Lane wrote: > If you really are hot about saving that other 440 bytes, the way to > do it would be to drop the struct entirely and use two parallel > arrays, an int16[] for value and a char[] (or better uint8[]) for > category. Those would be filled by reading kwlist.h twice with > different definitions for PG_KEYWORD. Not sure it's worth the > trouble though --- in particular, not clear that it's a win from > the standpoint of number of cache lines touched. Understood. That said, after re-implementing all keyword lookups, I wondered if there'd be a notational benefit to dropping the struct, especially since as yet no caller uses both token and category. It makes pl_scanner.c and its reserved keyword list a bit nicer, and gets rid of the need to force frontend to have 'zero' token numbers, but I'm not sure it's a clear win. I've attached a patch (applies on top of v6), gzipped to avoid confusing the cfbot. -John Naylor keyword-nostruct.patch.gz Description: GNU Zip compressed data
Re: WIP: Avoid creation of the free space map for small tables
On Wed, Jan 9, 2019 at 10:50 PM Amit Kapila wrote: > Thanks, Mithun for performance testing, it really helps us to choose > the right strategy here. Once John provides next version, it would be > good to see the results of regular pgbench (read-write) runs (say at > 50 and 300 scale factor) and the results of large copy. I don't think > there will be any problem, but we should just double check that. Attached is v12 using the alternating-page strategy. I've updated the comments and README as needed. In addition, I've -handled a possible stat() call failure during pg_upgrade -added one more assertion -moved the new README material into a separate paragraph -added a comment to FSMClearLocalMap() about transaction abort -corrected an outdated comment that erroneously referred to extension rather than creation -fleshed out the draft commit messages From 854ba27740ed26e0d3e89f6228186f2b0914b21e Mon Sep 17 00:00:00 2001 From: John Naylor Date: Thu, 10 Jan 2019 16:37:57 -0500 Subject: [PATCH v12 1/2] Avoid creation of the free space map for small heap relations. Previously, all heaps had FSMs. For very small tables, this means that the FSM took up more space than the heap did. This is wasteful, so now we refrain from creating the FSM for heaps with 4 pages or fewer. If the last known target block has insufficient space, we still try to insert into some other page before giving up and extending the relation, since doing otherwise leads to table bloat. Testing showed that trying every page penalized performance slightly, so we compromise and try every other page. This way, we visit at most two pages. Any pages with wasted free space become visible at next relation extension, so we still control table bloat. As a bonus, directly attempting one or two pages can even be faster than consulting the FSM would have been. If a heap with a FSM is truncated back to below the threshold, the FSM stays around and can be used as usual. 
--- contrib/pageinspect/expected/page.out | 77 +++ contrib/pageinspect/sql/page.sql | 33 +-- doc/src/sgml/storage.sgml | 13 +- src/backend/access/brin/brin.c| 2 +- src/backend/access/brin/brin_pageops.c| 10 +- src/backend/access/heap/hio.c | 47 ++-- src/backend/access/transam/xact.c | 14 ++ src/backend/commands/vacuumlazy.c | 17 +- src/backend/storage/freespace/README | 13 +- src/backend/storage/freespace/freespace.c | 262 +- src/backend/storage/freespace/indexfsm.c | 6 +- src/include/storage/freespace.h | 9 +- src/test/regress/expected/fsm.out | 62 + src/test/regress/parallel_schedule| 6 + src/test/regress/serial_schedule | 1 + src/test/regress/sql/fsm.sql | 46 16 files changed, 516 insertions(+), 102 deletions(-) create mode 100644 src/test/regress/expected/fsm.out create mode 100644 src/test/regress/sql/fsm.sql diff --git a/contrib/pageinspect/expected/page.out b/contrib/pageinspect/expected/page.out index 3fcd9fbe6d..83e5910453 100644 --- a/contrib/pageinspect/expected/page.out +++ b/contrib/pageinspect/expected/page.out @@ -1,48 +1,69 @@ CREATE EXTENSION pageinspect; -CREATE TABLE test1 (a int, b int); -INSERT INTO test1 VALUES (16777217, 131584); -VACUUM test1; -- set up FSM +CREATE TABLE test_rel_forks (a int); +-- Make sure there are enough blocks in the heap for the FSM to be created. +INSERT INTO test_rel_forks SELECT g from generate_series(1,1) g; +-- set up FSM and VM +VACUUM test_rel_forks; -- The page contents can vary, so just test that it can be read -- successfully, but don't keep the output. 
-SELECT octet_length(get_raw_page('test1', 'main', 0)) AS main_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'main', 0)) AS main_0; main_0 8192 (1 row) -SELECT octet_length(get_raw_page('test1', 'main', 1)) AS main_1; -ERROR: block number 1 is out of range for relation "test1" -SELECT octet_length(get_raw_page('test1', 'fsm', 0)) AS fsm_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'main', 100)) AS main_100; +ERROR: block number 100 is out of range for relation "test_rel_forks" +SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 0)) AS fsm_0; fsm_0 --- 8192 (1 row) -SELECT octet_length(get_raw_page('test1', 'fsm', 1)) AS fsm_1; - fsm_1 - 8192 -(1 row) - -SELECT octet_length(get_raw_page('test1', 'vm', 0)) AS vm_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 10)) AS fsm_10; +ERROR: block number 10 is out of range for relation "test_rel_forks" +SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 0)) AS vm_0; vm_0 -- 8192 (1 row) -SELECT octet_length(get_raw_page('test1', 'vm', 1)) AS vm_1; -ERROR: block number 1 is out of range for relation "test1" +SELECT octet_length(get_raw_page('test_rel_forks', 'vm
Re: unnecessary creation of FSM files during bootstrap mode
Thought I'd ping... (sorry for the top post) On Sat, Dec 15, 2018 at 12:02 AM Amit Kapila wrote: > > As pointed out by John Naylor [1], it seems during bootstrap mode, we > are always creating FSM files which are not required. In commit's > b9d01fe288 and 3908473c80, we have added some code where we allowed > the creation of files during mdopen even if they didn't exist during > the bootstrap mode. The comments in the code say: "During bootstrap, > there are cases where a system relation will be accessed (by internal > backend processes) before the bootstrap script nominally creates it." > I am sure this will be the case when that code is added but is it > required today? While going through commit 3908473c80, I came across > below comment: > > - * During bootstrap processing, we skip that check, because pg_time, > - * pg_variable, and pg_log get created before their .bki file entries > - * are processed. > - */ > + fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600); > > The system tables mentioned in above commit are not present today, so > do we really need that code and even if it is required shall we do it > only for 'main' or 'init' forks? > > Tom, as you are a committer of the commits b9d01fe288 and 3908473c80, > do you remember anything in this regard? > > > [1] - > https://www.postgresql.org/message-id/CAJVSVGVtf%2B-2sQGVyEZJQh15dpVicpFA6BBiPLugaD4oBEaiHg%40mail.gmail.com > > -- > With Regards, > Amit Kapila. > EnterpriseDB: http://www.enterprisedb.com >
Re: WIP: Avoid creation of the free space map for small tables
On Wed, Jan 9, 2019 at 7:33 AM Mithun Cy wrote: > 2. v11-Every-other-page and v11-last-page patches improve the > performance from base. > 3. IMHO v11-Every-other-page would be ideal to consider it improves > the performance and also to an extent avoid expansion if space is > already available. Good to hear. I'll clean up the every-other-page patch and include it in my next version. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)
On Wed, Jan 9, 2019 at 2:04 PM Tom Lane wrote: > > I wrote: > > John Naylor writes: > >> -There is a bit of a cognitive clash between $case_sensitive in > >> gen_keywordlist.pl and $case_insensitive in PerfectHash.pm. They each > >> make sense in their own file, but might it be worth using one or the > >> other? > Working on the fmgr-oid-lookup idea gave me the thought that > PerfectHash.pm ought to support fixed-length keys. Rather than start > adding random parameters to the function, I borrowed an idea from > PostgresNode.pm and made the options be keyword-style parameters. Now > the impedance mismatch about case sensitivity is handled with > > my $f = PerfectHash::generate_hash_function(\@keywords, $funcname, > case_insensitive => !$case_sensitive); > > which is at least a little clearer than before, though I'm not sure > if it entirely solves the problem. It's a bit clearer, but thinking about this some more, it makes sense for gen_keywordlist.pl to use $case_insensitive, because right now every instance of the var is "!$case_sensitive". In the attached (on top of v4), I change the command line option to --citext, and add the ability to negate it within the option, as '--no-citext'. It's kind of a double negative for the C-keywords invocation, but we can have the option for both cases, so we don't need to worry about what the default is (which is case_insensitive=1). -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services diff --git a/src/common/Makefile b/src/common/Makefile index d0c2b970eb..5f9327de59 100644 --- a/src/common/Makefile +++ b/src/common/Makefile @@ -124,7 +124,7 @@ libpgcommon_srv.a: $(OBJS_SRV) # generate SQL keyword lookup table to be included into keywords*.o. 
kwlist_d.h: $(top_srcdir)/src/include/parser/kwlist.h $(GEN_KEYWORDLIST_DEPS) - $(GEN_KEYWORDLIST) --extern $< + $(GEN_KEYWORDLIST) --extern --citext $< # Dependencies of keywords*.o need to be managed explicitly to make sure # that you don't get broken parsing code, even in a non-enable-depend build. diff --git a/src/interfaces/ecpg/preproc/Makefile b/src/interfaces/ecpg/preproc/Makefile index 6c02f97622..96dd76317f 100644 --- a/src/interfaces/ecpg/preproc/Makefile +++ b/src/interfaces/ecpg/preproc/Makefile @@ -60,10 +60,10 @@ preproc.y: ../../../backend/parser/gram.y parse.pl ecpg.addons ecpg.header ecpg. # generate keyword headers c_kwlist_d.h: c_kwlist.h $(GEN_KEYWORDLIST_DEPS) - $(GEN_KEYWORDLIST) --varname ScanCKeywords --case $< + $(GEN_KEYWORDLIST) --varname ScanCKeywords --no-citext $< ecpg_kwlist_d.h: ecpg_kwlist.h $(GEN_KEYWORDLIST_DEPS) - $(GEN_KEYWORDLIST) --varname ScanECPGKeywords $< + $(GEN_KEYWORDLIST) --varname ScanECPGKeywords --citext $< # Force these dependencies to be known even without dependency info built: ecpg_keywords.o c_keywords.o keywords.o preproc.o pgc.o parser.o: preproc.h diff --git a/src/pl/plpgsql/src/Makefile b/src/pl/plpgsql/src/Makefile index 8a0f294323..bc4ee50682 100644 --- a/src/pl/plpgsql/src/Makefile +++ b/src/pl/plpgsql/src/Makefile @@ -80,10 +80,10 @@ plerrcodes.h: $(top_srcdir)/src/backend/utils/errcodes.txt generate-plerrcodes.p # generate keyword headers for the scanner pl_reserved_kwlist_d.h: pl_reserved_kwlist.h $(GEN_KEYWORDLIST_DEPS) - $(GEN_KEYWORDLIST) --varname ReservedPLKeywords $< + $(GEN_KEYWORDLIST) --varname ReservedPLKeywords --citext $< pl_unreserved_kwlist_d.h: pl_unreserved_kwlist.h $(GEN_KEYWORDLIST_DEPS) - $(GEN_KEYWORDLIST) --varname UnreservedPLKeywords $< + $(GEN_KEYWORDLIST) --varname UnreservedPLKeywords --citext $< check: submake diff --git a/src/tools/gen_keywordlist.pl b/src/tools/gen_keywordlist.pl index 2744e1dc26..927511d7c4 100644 --- a/src/tools/gen_keywordlist.pl +++ 
b/src/tools/gen_keywordlist.pl @@ -35,13 +35,13 @@ use PerfectHash; my $output_path = ''; my $extern = 0; -my $case_sensitive = 0; +my $case_insensitive = 1; my $varname = 'ScanKeywords'; GetOptions( 'output:s' => \$output_path, 'extern' => \$extern, - 'case-sensitive' => \$case_sensitive, + 'citext!'=> \$case_insensitive, 'varname:s' => \$varname) || usage(); my $kw_input_file = shift @ARGV || die "No input file.\n"; @@ -97,7 +97,7 @@ while (<$kif>) } # When being case-insensitive, insist that the input be all-lower-case. -if (!$case_sensitive) +if ($case_insensitive) { foreach my $kw (@keywords) { @@ -157,7 +157,7 @@ printf $kwdef "#define %s_NUM_KEYWORDS %d\n\n", uc $varname, scalar @keywords; my $funcname = $varname . "_hash_func"; my $f = PerfectHash::generate_hash_function(\@keywords, $funcname, - case_insensitive => !$case_sensitive); + case_insensitive => $case_insensitive); printf $kwdef qq|static %s\n|, $f
Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)
On Tue, Jan 8, 2019 at 5:31 PM Tom Lane wrote: > > John Naylor writes: > > In the committed keyword patch, I noticed that in common/keywords.c, > > the array length is defined with > > ScanKeywordCategories[SCANKEYWORDS_NUM_KEYWORDS] > > but other keyword arrays just have ...[]. Is there a reason for the > > difference? > > The length macro was readily available there so I used it. AFAIR > that wasn't true elsewhere, though I might've missed something. > It's pretty much just belt-and-suspenders coding anyway, since all > those arrays are machine generated ... I tried using the available num_keywords macro in plpgsql and it worked fine, but it makes the lines really long. Alternatively, as in the attached, we could remove the single use of the core macro and maybe add comments to the generated magic numbers. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services diff --git a/src/common/keywords.c b/src/common/keywords.c index 84f779feb9..edec4a5094 100644 --- a/src/common/keywords.c +++ b/src/common/keywords.c @@ -26,7 +26,7 @@ #define PG_KEYWORD(kwname, value, category) category, -const uint8 ScanKeywordCategories[SCANKEYWORDS_NUM_KEYWORDS] = { +const uint8 ScanKeywordCategories[] = { #include "parser/kwlist.h" }; diff --git a/src/tools/gen_keywordlist.pl b/src/tools/gen_keywordlist.pl index 927511d7c4..8c4c5da828 100644 --- a/src/tools/gen_keywordlist.pl +++ b/src/tools/gen_keywordlist.pl @@ -147,11 +147,6 @@ foreach my $name (@keywords) print $kwdef "};\n\n"; -# Emit a macro defining the number of keywords. -# (In some places it's useful to have access to that as a constant.) - -printf $kwdef "#define %s_NUM_KEYWORDS %d\n\n", uc $varname, scalar @keywords; - # Emit the definition of the hash function. my $funcname = $varname . 
"_hash_func"; @@ -168,8 +163,8 @@ printf $kwdef "const ScanKeywordList %s = {\n", $varname; printf $kwdef qq|\t%s_kw_string,\n|, $varname; printf $kwdef qq|\t%s_kw_offsets,\n|, $varname; printf $kwdef qq|\t%s,\n|, $funcname; -printf $kwdef qq|\t%s_NUM_KEYWORDS,\n|, uc $varname; -printf $kwdef qq|\t%d\n|, $max_len; +printf $kwdef qq|\t%d,\t/* number of keywords */\n|, scalar @keywords; +printf $kwdef qq|\t%d\t/* max keyword length */\n|, $max_len; printf $kwdef "};\n\n"; printf $kwdef "#endif\t\t\t\t\t\t\t/* %s_H */\n", uc $base_filename;
Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)
On Wed, Jan 9, 2019 at 2:44 PM Tom Lane wrote: > [patch to shrink oid index] It would help maintaining its newfound svelteness if we warned if a higher oid was assigned, as in the attached. I used 6200 as a soft limit, but that could be anything similar. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services diff --git a/src/backend/catalog/genbki.pl b/src/backend/catalog/genbki.pl index e7ead7dcc0..201343c2f6 100644 --- a/src/backend/catalog/genbki.pl +++ b/src/backend/catalog/genbki.pl @@ -74,6 +74,7 @@ my %catalog_data; my @toast_decls; my @index_decls; my %oidcounts; +my $max_proc_oid = 6200; foreach my $header (@input_files) { @@ -230,6 +231,13 @@ foreach my $row (@{ $catalog_data{pg_opfamily} }) my %procoids; foreach my $row (@{ $catalog_data{pg_proc} }) { + # Warn if a large OID has been newly assigned. + if ($row->{oid} > $max_proc_oid) + { + warn sprintf "Consider using an OID smaller than %d for %s (see commit 8ff5f824dca75)\n", + $max_proc_oid, $row->{proname}; + } + # Generate an entry under just the proname (corresponds to regproc lookup) my $prokey = $row->{proname}; if (defined $procoids{$prokey})
automatically assigning catalog toast oids
Commit 96cdeae07 added toast tables to most catalogs. One disadvantage is that the toast declarations require hard-coded oids, even though only shared catalogs actually need stable oids. Now that we can assign oids on the fly, it makes sense to do so for toast tables as well, as in the attached. -John Naylor src/backend/catalog/genbki.pl | 26 ++ src/include/catalog/toasting.h | 32 ++-- 2 files changed, 28 insertions(+), 30 deletions(-) diff --git a/src/backend/catalog/genbki.pl b/src/backend/catalog/genbki.pl index edc8ea9f53..f4d7160c6a 100644 --- a/src/backend/catalog/genbki.pl +++ b/src/backend/catalog/genbki.pl @@ -285,6 +285,7 @@ my @tables_needing_macros; foreach my $catname (@catnames) { my $catalog = $catalogs{$catname}; + my $has_toastable_type = 0; # Create one definition header with macro definitions for each catalog. my $def_file = $output_path . $catname . '_d.h'; @@ -345,6 +346,12 @@ EOM # Build hash of column names for use later $attnames{$attname} = 1; + # Flag catalogs with toastable datatypes. + if ($types{$atttype}->{typstorage} eq 'x') + { + $has_toastable_type = 1; + } + # Emit column definitions if (!$first) { @@ -506,6 +513,25 @@ EOM } } + # Create toast declarations for normal catalogs. To prevent + # circular dependencies and to avoid adding complexity to VACUUM + # FULL logic, exclude pg_class, pg_attribute, and pg_index. Also, + # to prevent pg_upgrade from seeing a non-empty new cluster, exclude + # pg_largeobject and pg_largeobject_metadata from the set as large + # object data is handled as user data. Those relations have no reason + # to use a toast table anyway. Toast tables and indexes for shared + # catalogs need stable oids, so must be specified in toasting.h. 
+ if ($has_toastable_type and !$catalog->{shared_relation} + and $catname ne 'pg_class' + and $catname ne 'pg_attribute' + and $catname ne 'pg_index' + and $catname ne 'pg_largeobject' + and $catname ne 'pg_largeobject_metadata') + { + push @toast_decls, sprintf "declare toast %s %s on %s\n", + $maxoid++, $maxoid++, $catname; + } + print $bki "close $catname\n"; printf $def "\n#endif\t\t\t\t\t\t\t/* %s_D_H */\n", uc $catname; diff --git a/src/include/catalog/toasting.h b/src/include/catalog/toasting.h index f259890e43..1c389c685d 100644 --- a/src/include/catalog/toasting.h +++ b/src/include/catalog/toasting.h @@ -42,37 +42,9 @@ extern void BootstrapToastTable(char *relName, * the OID to assign to the toast table, and the OID to assign to the * toast table's index. The reason we hard-wire these OIDs is that we * need stable OIDs for shared relations, and that includes toast tables - * of shared relations. + * of shared relations. Toast table commands for normal catalogs are + * generated by genbki.pl, and oids for those are assigned on the fly. 
*/ - -/* normal catalogs */ -DECLARE_TOAST(pg_aggregate, 4159, 4160); -DECLARE_TOAST(pg_attrdef, 2830, 2831); -DECLARE_TOAST(pg_collation, 4161, 4162); -DECLARE_TOAST(pg_constraint, 2832, 2833); -DECLARE_TOAST(pg_default_acl, 4143, 4144); -DECLARE_TOAST(pg_description, 2834, 2835); -DECLARE_TOAST(pg_event_trigger, 4145, 4146); -DECLARE_TOAST(pg_extension, 4147, 4148); -DECLARE_TOAST(pg_foreign_data_wrapper, 4149, 4150); -DECLARE_TOAST(pg_foreign_server, 4151, 4152); -DECLARE_TOAST(pg_foreign_table, 4153, 4154); -DECLARE_TOAST(pg_init_privs, 4155, 4156); -DECLARE_TOAST(pg_language, 4157, 4158); -DECLARE_TOAST(pg_namespace, 4163, 4164); -DECLARE_TOAST(pg_partitioned_table, 4165, 4166); -DECLARE_TOAST(pg_policy, 4167, 4168); -DECLARE_TOAST(pg_proc, 2836, 2837); -DECLARE_TOAST(pg_rewrite, 2838, 2839); -DECLARE_TOAST(pg_seclabel, 3598, 3599); -DECLARE_TOAST(pg_statistic, 2840, 2841); -DECLARE_TOAST(pg_statistic_ext, 3439, 3440); -DECLARE_TOAST(pg_trigger, 2336, 2337); -DECLARE_TOAST(pg_ts_dict, 4169, 4170); -DECLARE_TOAST(pg_type, 4171, 4172); -DECLARE_TOAST(pg_user_mapping, 4173, 4174); - -/* shared catalogs */ DECLARE_TOAST(pg_authid, 4175, 4176); #define PgAuthidToastTable 4175 #define PgAuthidToastIndex 4176
Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)
On 12/19/18, Andrew Gierth wrote: > Is there any particular reason not to go further and use a perfect hash > function for the lookup, rather than binary search? When I was investigating faster algorithms, I ruled out gperf based on discussions in the archives. The approach here has modest goals and shouldn't be too invasive. With the makefile support and separate keyword files in place, that'll be one less thing to do if we ever decide to replace binary search. The giant string will likely be useful as well. Since we're on the subject, I think some kind of trie would be ideal performance-wise, but a large amount of work. The nice thing about a trie is that it can be faster than a hash table for a key miss. I found a paper that described some space-efficient trie variations [1], but we'd likely have to code the algorithm and a way to emit a C code representation of it. I've found some libraries, but that would have more of the same difficulties in practicality that gperf had. [1] https://infoscience.epfl.ch/record/64394/files/triesearches.pdf -John Naylor
Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)
On 12/18/18, Tom Lane wrote: > I'd be kind of inclined to convert all uses of ScanKeyword to the new way, > if only for consistency's sake. On the other hand, I'm not the one > volunteering to do the work. That's reasonable, as long as the design is nailed down first. Along those lines, attached is a heavily WIP patch that only touches plpgsql unreserved keywords, to test out the new methodology in a limited area. After settling APIs and name/directory bikeshedding, I'll move on to the other four keyword types. There's a new Perl script, src/common/gen_keywords.pl, which takes pl_unreserved_kwlist.h as input and outputs pl_unreserved_kwlist_offset.h and pl_unreserved_kwlist_string.h. The output headers are not installed or symlinked anywhere. Since the input keyword lists will never be #included directly, they might be better as .txt files, like errcodes.txt. If we went that far, we might also remove the PG_KEYWORD macros (they'd still be in the output files) and rename parser/kwlist.h to common/core_kwlist.txt. There's also a case for not changing things unnecessarily, especially if there's ever a new reason to include the base keyword list directly. To keep the other keyword types functional, I had to add a separate new struct ScanKeywordOffset and new function ScanKeywordLookupOffset(), so the patch is a bit messier than the final will be. With a 4-byte offset, ScanKeywordOffset is 8 bytes, down from 12, and is now a power of 2. I used the global .gitignore, but maybe that's an abuse of it. Make check passes, but I don't know how well it stresses keyword use. I'll create a commitfest entry soon. 
-John Naylor .gitignore| 2 + src/common/gen_keywords.pl| 151 ++ src/common/keywords.c | 55 +++ src/include/common/keywords.h | 14 +++ src/pl/plpgsql/src/Makefile | 13 ++- src/pl/plpgsql/src/pl_scanner.c | 108 - src/pl/plpgsql/src/pl_unreserved_kwlist.h | 107 + 7 files changed, 355 insertions(+), 95 deletions(-) diff --git a/.gitignore b/.gitignore index 794e35b73c..8a9a05c20a 100644 --- a/.gitignore +++ b/.gitignore @@ -31,6 +31,8 @@ win32ver.rc *.exe lib*dll.def lib*.pc +*kwlist_offset.h +*kwlist_string.h # Local excludes in root directory /GNUmakefile diff --git a/src/common/gen_keywords.pl b/src/common/gen_keywords.pl new file mode 100644 index 00..a8a8576d43 --- /dev/null +++ b/src/common/gen_keywords.pl @@ -0,0 +1,151 @@ +#-- +# +# gen_keywords.pl +# Perl script that generates *_offset.h and *_string.h from a given +# keyword list file. These headers are then included into files that +# call ScanKeywordLookup() on that keyword list. The keyword name is +# is represented as an offset into a single string. +# +# Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California +# +# src/commen/gen_keywords.pl +# +#-- + + +my $output_path = ''; +my $kw_input_file; + +# Process command line switches. +while (@ARGV) +{ + my $arg = shift @ARGV; + if ($arg !~ /^-/) + { + $kw_input_file = $arg; + } + elsif ($arg =~ /^-o/) + { + $output_path = length($arg) > 2 ? substr($arg, 2) : shift @ARGV; + } + else + { + usage(); + } +} + + +# Make sure output_path ends in a slash. +if ($output_path ne '' && substr($output_path, -1) ne '/') +{ + $output_path .= '/'; +} + +$kw_input_file =~ /(\w*kwlist)\.h/; +my $base_filename = $1; + +my $kw_offset_file = $output_path . $base_filename . '_offset.h'; +my $kw_string_file = $output_path . $base_filename . 
'_string.h'; + +open(my $kif, '<', $kw_input_file) || die "$kw_input_file: $!"; +open(my $kof, '>', $kw_offset_file) || die "$kw_offset_file: $!"; +open(my $ksf, '>', $kw_string_file) || die "$kw_string_file: $!"; + +# Opening boilerplate for keyword offset header. +printf $kof <) +{ + if (/^PG_KEYWORD\("(\w+)",\s+(\w+),\s+(\w+)\)/) + { + $name = $1; + $value = $2; + $category = $3; + + push @keywords, $name; + + # Emit ScanKeyword macros with numerical offsets instead of text. + print $kof "PG_KEYWORD($offset, $value, $category)\n"; + + # Calculate the cumulative offset of the next keyword, + # taking into account the null terminator. + $offset += length($name) +1; + } +} + +# TODO: Error out if the keyword names are not in ASCII order. + +printf $ksf qq|const char *%s_string =\n"|, $base_filename; +print $ksf join qq|\\0"\n"|, @keywords; +printf $ksf qq|";\n\n#endif\t\t\t\t\t\t\t/* %s_STRING_H */\n|, uc $base_filename; + + +sub usage +{ + die <= NAMEDATALEN) + return NULL; + + /* + * Apply an ASCII-only downcasing. We must not use to
Re: Delay locking partitions during INSERT and UPDATE
On Wed, Jan 23, 2019 at 7:56 PM David Rowley wrote: > On Thu, 24 Jan 2019 at 13:38, John Naylor wrote: > > Hmm, now instead of an 85x speedup over master in the 10k partition > > case, I only get 20x. Anyone else see this? > > What's it like with fsync off? No change. Just in case, I tried git bisect and also recreated the cluster, but never got the same performance as my first test, so not sure what happened. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: Delay locking partitions during INSERT and UPDATE
On Thu, Jan 24, 2019 at 4:17 PM Tomas Vondra wrote: > I can still see about the same performance as before (on both clusters). Thanks for checking! I think the thing to do now is have a committer look at it. It's a small patch with obvious performance effects -- there's just the question of whether the locking behavior with concurrent DDL is as safe as it is now. Anyone have anything else? -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: WIP: Avoid creation of the free space map for small tables
On Wed, Jan 23, 2019 at 11:17 PM Amit Kapila wrote: > > On Thu, Jan 24, 2019 at 3:39 AM John Naylor > wrote: > > mkdir -p data1 data2 standby > > > > echo 'heap' > data1/foo > > echo 'fsm' > data1/foo_fsm > > > > # simulate streaming replication > > rsync --archive data1 standby > > > > # simulate pg_upgrade, skipping FSM > > ln data1/foo -t data2/ > > > > rsync --archive --delete --hard-links --size-only --no-inc-recursive > > data1 data2 standby > > > > # result > > ls standby/data1 > > ls standby/data2 > > > > > > The result is that foo_fsm is not copied to standby/data2, contrary to > > what the docs above imply for other unlinked files. Can anyone shed > > light on this? > > > > Is foo_fsm present in standby/data1? Yes it is. > I think what doc means to say is > that it copies any unlinked files present in primary's new cluster > (which in your case will be data2). In that case, I'm still confused why that doc says, "Unfortunately, rsync needlessly copies files associated with temporary and unlogged tables because these files don't normally exist on standby servers." I fail to see why the primary's new cluster would have these if they weren't linked. And in the case we're discussing here, the skipped FSMs won't be on data2, so won't end up in standby/data2. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: Delay locking partitions during INSERT and UPDATE
The cfbot reports this patch no longer applies. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: WIP: Avoid creation of the free space map for small tables
On Sat, Jan 26, 2019 at 2:14 PM Amit Kapila wrote: > > On Sat, Jan 26, 2019 at 5:05 AM John Naylor > wrote: > > > > So, in v19 we check pg_class.relpages and if it's > > a heap and less than or equal the threshold we call stat on the 0th > > segment to verify. > > > > Okay, but the way logic is implemented appears clumsy to me. > The function transfer_relfile has no clue about skipping of FSM stuff, > but it contains comments about it. Yeah, I wasn't entirely happy with how that turned out. > I think there is some value in using the information from > this function to skip fsm files, but the code doesn't appear to fit > well, how about moving this check to new function > new_cluster_needs_fsm()? For v21, new_cluster_needs_fsm() has all responsibility for obtaining the info it needs. I think this is much cleaner, but there is a small bit of code duplication since it now has to form the file name. One thing we could do is form the the base old/new file names in transfer_single_new_db() and pass those to transfer_relfile(), which will only add suffixes and segment numbers. We could then pass the base old file name to new_cluster_needs_fsm() and use it as is. Not sure if that's worthwhile, though. > The order in which relkind and relpages is used in the above code is > different from the order in which it is mentioned in the query, it > won't matter, but keeping in order will make look code consistent. I > have made this and some more minor code adjustments in the attached > patch. If you like those, you can include them in the next version of > your patch. Okay, done. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services From 2b70b65f43c05234b81e939b0ed49e34b97d5667 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Sun, 27 Jan 2019 21:42:07 +0100 Subject: [PATCH v21 3/3] During pg_upgrade, conditionally skip transfer of FSMs. 
If a heap on the old cluster has 4 pages or fewer, and the old cluster was PG v11 or earlier, don't copy or link the FSM. This will shrink space usage for installations with large numbers of small tables. --- src/bin/pg_upgrade/info.c| 16 +++-- src/bin/pg_upgrade/pg_upgrade.h | 6 src/bin/pg_upgrade/relfilenode.c | 58 ++-- 3 files changed, 76 insertions(+), 4 deletions(-) diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c index 2f925f086c..902bfc647e 100644 --- a/src/bin/pg_upgrade/info.c +++ b/src/bin/pg_upgrade/info.c @@ -200,6 +200,8 @@ create_rel_filename_map(const char *old_data, const char *new_data, map->old_db_oid = old_db->db_oid; map->new_db_oid = new_db->db_oid; + map->relpages = old_rel->relpages; + map->relkind = old_rel->relkind; /* * old_relfilenode might differ from pg_class.oid (and hence @@ -418,6 +420,7 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo) char *nspname = NULL; char *relname = NULL; char *tablespace = NULL; + char *relkind = NULL; int i_spclocation, i_nspname, i_relname, @@ -425,7 +428,9 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo) i_indtable, i_toastheap, i_relfilenode, -i_reltablespace; +i_reltablespace, +i_relpages, +i_relkind; char query[QUERY_ALLOC]; char *last_namespace = NULL, *last_tablespace = NULL; @@ -494,7 +499,7 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo) */ snprintf(query + strlen(query), sizeof(query) - strlen(query), "SELECT all_rels.*, n.nspname, c.relname, " - " c.relfilenode, c.reltablespace, %s " + " c.relfilenode, c.reltablespace, c.relpages, c.relkind, %s " "FROM (SELECT * FROM regular_heap " " UNION ALL " " SELECT * FROM toast_heap " @@ -525,6 +530,8 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo) i_relname = PQfnumber(res, "relname"); i_relfilenode = PQfnumber(res, "relfilenode"); i_reltablespace = PQfnumber(res, "reltablespace"); + i_relpages = PQfnumber(res, "relpages"); + i_relkind = PQfnumber(res, "relkind"); i_spclocation = PQfnumber(res, "spclocation"); for 
(relnum = 0; relnum < ntups; relnum++) @@ -556,6 +563,11 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo) curr->relname = pg_strdup(relname); curr->relfilenode = atooid(PQgetvalue(res, relnum, i_relfilenode)); + curr->relpages = atoi(PQgetvalue(res, relnum, i_relpages)); + + relkind = PQgetvalue(res, relnum, i_relkind); + curr->relkind = relkind[0]; + curr->tblsp_alloc = false; /* Is the tablespace oid non-default? */ diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 2f67eee22b..baeb8ff0f8 100644 --- a/src/bin/pg_upgrade/pg_upgrade
Re: WIP: Avoid creation of the free space map for small tables
On Mon, Jan 28, 2019 at 3:53 AM Amit Kapila wrote: > On Thu, Jan 24, 2019 at 9:14 AM Amit Kapila wrote: > > Sure, apart from this I have run pgindent on the patches and make some > > changes accordingly. Latest patches attached (only second patch has > > some changes). I will take one more pass on Monday morning (28th Jan) > > and will commit unless you or others see any problem. > > Pushed these two patches. Thank you for your input and detailed review! Thank you Mithun for testing! -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: WIP: Avoid creation of the free space map for small tables
On Mon, Jan 28, 2019 at 4:53 AM Amit Kapila wrote: > There are a few buildfarm failures due to this commit, see my email on > pgsql-committers. If you have time, you can also once look into > those. I didn't see anything in common with the configs of the failed members. None have a non-default BLCKSZ that I can see. Looking at this typical example from woodlouse: == pgsql.build/src/test/regress/regression.diffs == --- C:/buildfarm/buildenv/HEAD/pgsql.build/src/test/regress/expected/fsm.out 2019-01-28 04:43:09.031456700 +0100 +++ C:/buildfarm/buildenv/HEAD/pgsql.build/src/test/regress/results/fsm.out 2019-01-28 05:06:20.351100400 +0100 @@ -26,7 +26,7 @@ pg_relation_size('fsm_check_size', 'fsm') AS fsm_size; heap_size | fsm_size ---+-- - 24576 |0 + 32768 |0 (1 row) ***It seems like the relation extended when the new records should have gone into block 0. -- Extend table with enough blocks to exceed the FSM threshold @@ -56,7 +56,7 @@ SELECT pg_relation_size('fsm_check_size', 'fsm') AS fsm_size; fsm_size -- -16384 +24576 (1 row) ***And here it seems vacuum didn't truncate the FSM. I wonder if the heap didn't get truncated either. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: pgsql: Avoid creation of the free space map for small heap relations.
On Mon, Jan 28, 2019 at 6:58 AM Amit Kapila wrote: > > On Mon, Jan 28, 2019 at 11:10 AM Andrew Gierth > you just need to create free space on a page that didn't have enough > > before? It might be worth tweaking the fillfactor rather than trying to > > delete anything. > > > > No, it also depends on vacuum to remove rows and then truncate the > relation accordingly. That particular test could be removed -- it's just verifying behavior that's already been there for years and is a direct consquence of normal truncation combined with the addressing scheme of the FSM logic. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: [HACKERS] PATCH: multivariate histograms and MCV lists
I believe I found a typo in mcv.c, fix attached. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services mcv-comment-fix.patch Description: Binary data
Re: pgsql: Avoid creation of the free space map for small heap relations, t
On Tue, Feb 26, 2019 at 8:59 AM Amit Kapila wrote: > I will look into it today and respond back with my findings. John, > see if you also get time to investigate this. Will do, but it will likely be tomorrow -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: WIP: Avoid creation of the free space map for small tables
On Thu, Feb 21, 2019 at 7:58 AM Amit Kapila wrote: > So here you are inserting 4-byte integer and 1024-bytes variable > length record. So the tuple length will be tuple_header (24-bytes) + > 4-bytes for integer + 4-bytes header for variable length data + 1024 > bytes of actual data. So, the length will be 1056 which is already > MAXALIGN. I took the new comments added in your latest version of the > patch and added them to the previous version of the patch. Kindly > see if I have not missed anything while merging the patch-versions? OK, that makes sense. Looks fine to me. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: pgsql: Avoid creation of the free space map for small heap relations, t
On Wed, Feb 27, 2019 at 5:06 AM Amit Kapila wrote: > I have tried this test many times (more than 1000 times) by varying > thread count, but couldn't reproduce it. My colleague, Kuntal has > tried a similar test overnight, but the issue didn't reproduce which > is not surprising to me seeing the nature of the problem. As I could > reproduce it via the debugger, I think we can go ahead with the fix. > I have improved the comments in the attached patch and I have also > tried to address Tom's concern w.r.t comments by adding additional > comments atop of data-structure used to maintain the local map. The flaw in my thinking was treating extension too similarly to finding an existing block. With your patch clearing the local map in the correct place, it seems the call at hio.c:682 is now superfluous? It wasn't sufficient before, so should this call be removed so as not to mislead? Or maybe keep it to be safe, with a comment like "This should already be cleared by now, but make sure it is." + * To maintain good performance, we only mark every other page as available + * in this map. I think this implementation detail is not needed to comment on the struct. I think the pointer to the README is sufficient. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: pgsql: Avoid creation of the free space map for small heap relations, t
On Tue, Feb 26, 2019 at 10:28 AM Amit Kapila wrote: > John, others, can you review my findings and patch? I can confirm your findings with your debugging steps. Nice work! -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: WIP: Avoid creation of the free space map for small tables
On Fri, Jan 25, 2019 at 9:50 AM Amit Kapila wrote: > Once we agree on the code, we need to test below scenarios: > (a) upgrade from all supported versions to the latest version > (b) upgrade standby with and without using rsync. Although the code hasn't been reviewed yet, I went ahead and tested (a) on v21 of the pg_upgrade patch [1]. To do this I dumped out a 9.4 instance with the regression database and restored it to all supported versions. To make it work with pg_upgrade, I first had to drop tables with oids, drop functions referring to C libraries, and drop the later-removed '=>' operator. Then I pg_upgrade'd in copy mode from all versions to HEAD with the patch applied. pg_upgrade worked without error, and the following query returned 0 bytes on all the new clusters: select sum(pg_relation_size(oid, 'fsm')) as total_fsm_size from pg_class where relkind in ('r', 't') and pg_relation_size(oid, 'main') <= 4 * 8192; The complementary query (> 4 * 8192) returned the same number of bytes as in the original 9.4 instance. To make it easy to find, the latest regression test patch, which is intended to be independent of block-size, was in [2]. [1] https://www.postgresql.org/message-id/CACPNZCu4cOdm3uGnNEGXivy7Gz8UWyQjynDpdkPGabQ18_zK6g%40mail.gmail.com [2] https://www.postgresql.org/message-id/CACPNZCsWa%3Ddd0K%2BFiODwM%3DLEsepAHVJCoSx2Avew%3DxBEX3Ywjw%40mail.gmail.com -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: Why don't we have a small reserved OID range for patch revisions?
I wrote: > Along those lines, here's a draft patch to do just that. It handles > array type oids as well. Run it like this: > > perl reformat_dat_file.pl --map-from 9000 --map-to 2000 *.dat > > There is some attempt at documentation. So far it doesn't map by > default, but that could be changed if we agreed on the convention of > 9000 or whatever. In case we don't want to lose track of this, I added it to the March commitfest with a target of v13. (I didn't see a way to add it to the July commitfest) -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: pgsql: Avoid creation of the free space map for small heap relations, t
On Thu, Feb 28, 2019 at 7:24 AM Amit Kapila wrote: > > The flaw in my thinking was treating extension too similarly too > > finding an existing block. With your patch clearing the local map in > > the correct place, it seems the call at hio.c:682 is now superfluous? > > What if get some valid block from the first call to > GetPageWithFreeSpace via local map which has required space? I think > in that case we will need the call at hio.c:682. Am I missing > something? Are you referring to the call at line 393? Then the map will be cleared on line 507 before we return that buffer. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: pgsql: Avoid creation of the free space map for small heap relations, t
On Thu, Feb 28, 2019 at 10:25 AM Amit Kapila wrote: > > Here's an updated patch based on comments by you. I will proceed with > this unless you have any more comments. Looks good to me. I would just adjust the grammar in the comment, from "This prevents us to use the map", to "This prevents us from using the map". Perhaps also a comma after "first". -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: WIP: Avoid creation of the free space map for small tables
On Thu, Feb 21, 2019 at 9:27 PM Alvaro Herrera wrote: > > I think this test is going to break on nonstandard block sizes. While > we don't promise that all tests work on such installs (particularly > planner ones), it seems fairly easy to cope with this one -- just use a > record size expressed as a fraction of current_setting('block_size'). > So instead of "1024" you'd write current_setting('block_size') / 8. > And then display the relation size in terms of pages, not bytes, so > divide pg_relation_size by block size. I've done this for v6, tested on 16k block size. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services From 157f74155beb53c455b0a85b6eebdd22a60ce5f5 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Sat, 23 Feb 2019 08:48:31 +0100 Subject: [PATCH v6] Add more tests for FSM. In commit b0eaa4c51bb, we left out a test that used a vacuum to remove dead rows as the behavior of test was not predictable. This test has been rewritten to use fillfactor instead to control free space. Since we no longer need to remove dead rows as part of the test, put the fsm regression test in a parallel group. 
--- src/test/regress/expected/fsm.out | 59 +- src/test/regress/parallel_schedule | 8 +--- src/test/regress/serial_schedule | 2 +- src/test/regress/sql/fsm.sql | 41 + 4 files changed, 77 insertions(+), 33 deletions(-) diff --git a/src/test/regress/expected/fsm.out b/src/test/regress/expected/fsm.out index b02993188c..698d4b0be5 100644 --- a/src/test/regress/expected/fsm.out +++ b/src/test/regress/expected/fsm.out @@ -1,15 +1,40 @@ -- -- Free Space Map test -- +SELECT current_setting('block_size')::integer AS blocksize, +current_setting('block_size')::integer / 8 AS strsize +\gset CREATE TABLE fsm_check_size (num int, str text); --- With one block, there should be no FSM -INSERT INTO fsm_check_size VALUES(1, 'a'); +-- Fill 3 blocks with one record each +ALTER TABLE fsm_check_size SET (fillfactor=15); +INSERT INTO fsm_check_size SELECT i, rpad('', :strsize, 'a') +FROM generate_series(1,3) i; +-- There should be no FSM VACUUM fsm_check_size; -SELECT pg_relation_size('fsm_check_size', 'main') AS heap_size, -pg_relation_size('fsm_check_size', 'fsm') AS fsm_size; - heap_size | fsm_size +-- - 8192 |0 +SELECT pg_relation_size('fsm_check_size', 'main') / :blocksize AS heap_nblocks, +pg_relation_size('fsm_check_size', 'fsm') / :blocksize AS fsm_nblocks; + heap_nblocks | fsm_nblocks +--+- +3 | 0 +(1 row) + +-- The following operations are for testing the functionality of the local +-- in-memory map. In particular, we want to be able to insert into some +-- other block than the one at the end of the heap, without using a FSM. 
+-- Fill most of the last block +ALTER TABLE fsm_check_size SET (fillfactor=100); +INSERT INTO fsm_check_size SELECT i, rpad('', :strsize, 'a') +FROM generate_series(101,105) i; +-- Make sure records can go into any block but the last one +ALTER TABLE fsm_check_size SET (fillfactor=30); +-- Insert large record and make sure it does not cause the relation to extend +INSERT INTO fsm_check_size VALUES (111, rpad('', :strsize, 'a')); +VACUUM fsm_check_size; +SELECT pg_relation_size('fsm_check_size', 'main') / :blocksize AS heap_nblocks, +pg_relation_size('fsm_check_size', 'fsm') / :blocksize AS fsm_nblocks; + heap_nblocks | fsm_nblocks +--+- +3 | 0 (1 row) -- Extend table with enough blocks to exceed the FSM threshold @@ -26,23 +51,23 @@ num = 11; END; $$; VACUUM fsm_check_size; -SELECT pg_relation_size('fsm_check_size', 'fsm') AS fsm_size; - fsm_size --- -24576 +SELECT pg_relation_size('fsm_check_size', 'fsm') / :blocksize AS fsm_nblocks; + fsm_nblocks +- + 3 (1 row) -- Add long random string to extend TOAST table to 1 block INSERT INTO fsm_check_size VALUES(0, (SELECT string_agg(md5(chr(i)), '') - FROM generate_series(1,100) i)); + FROM generate_series(1, :blocksize / 100) i)); VACUUM fsm_check_size; -SELECT pg_relation_size(reltoastrelid, 'main') AS toast_size, -pg_relation_size(reltoastrelid, 'fsm') AS toast_fsm_size +SELECT pg_relation_size(reltoastrelid, 'main') / :blocksize AS toast_nblocks, +pg_relation_size(reltoastrelid, 'fsm') / :blocksize AS toast_fsm_nblocks FROM pg_class WHERE relname = 'fsm_check_size'; - toast_size | toast_fsm_size -+ - 8192 | 0 + toast_nblocks | toast_fsm_nblocks +---+--- + 1 | 0 (1 row) DROP TABLE fsm_check_size; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 4051a4ad4e..a956775dd1 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/re
Re: WIP: Avoid creation of the free space map for small tables
On Fri, Feb 22, 2019 at 3:59 AM Amit Kapila wrote: > The reason for not pushing much on making the test pass for > nonstandard block sizes is that when I tried existing tests, there > were already some failures. FWIW, I currently see 8 failures (attached). -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services jcn-16blk-regress.diffs Description: Binary data
outdated reference to tuple header OIDs
It seems this is a leftover from commit 578b229718e8. Patch attached. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services remove-ref-to-sys-oid-col.patch Description: Binary data
Re: WIP: Avoid creation of the free space map for small tables
On Thu, Mar 14, 2019 at 2:17 PM Amit Kapila wrote: > > 1. Added an Assert in new_cluster_needs_fsm() that old cluster version > should be >= 804 as that is where fsm support has been added. There is already an explicit check for 804 in the caller, and the HEAD code is already resilient to FSMs not existing, so I think this is superfluous. > 2. Reverted the old cluster version check to <= 1100. There was > nothing wrong in the way you have written a check, but I find most of > the other places in the code uses that way. > 3. At one place changed the function header to be consistent with the > nearby code, run pgindent on the code and changed the commit message. Looks good to me, thanks. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: Why don't we have a small reserved OID range for patch revisions?
On Tue, Mar 12, 2019 at 5:36 AM Tom Lane wrote: > This seems committable from my end --- any further comments? I gave it a read and it looks good to me, but I haven't tried to run it. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: WIP: Avoid creation of the free space map for small tables
On Fri, Mar 8, 2019 at 7:43 PM Amit Kapila wrote: > Few minor comments: > 1. > warning C4715: 'new_cluster_needs_fsm': not all control paths return a value > > Getting this new warning in the patch. Hmm, I don't get that in a couple versions of gcc. Your compiler must not know that pg_fatal() cannot return. I blindly added a fix. > 2. > > This comment line doesn't seem to be related to this patch. If so, I > think we can avoid having any additional change which is not related > to the functionality of this patch. Feel free to submit it > separately, if you think it is an improvement. > + > + /* Transfer any VM files if we can trust their contents. */ > if (vm_crashsafe_match) Well, I guess the current comment is still ok, so reverted. If I were to do a separate cleanup patch, I would rather remove the vm_must_add_frozenbit parameter -- there's no reason I can see for calls that transfer the heap and FSM to know about this. I also changed references to the 'first segment of the main fork' where there will almost always only be one segment. This was a vestige of the earlier algorithm I had. > 3. Can we add a note about this in the Notes section of pg_upgrade > documentation [1]? Done. > Have you done any performance testing of this patch? I mean to say > now that we added a new stat call for each table, we should see if > that has any impact. Ideally, that should be compensated by the fact > that we are now not transferring *fsm files for small relations. How > about constructing a test where all relations are greater than 4 pages > and then try to upgrade them. We can check for a cluster with a > different number of relations say 10K, 20K, 50K, 100K. I have not, but I agree it should be done. I will try to do so soon. 
-- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services From b65084daf15d198c9fdd986992b0147290c8e690 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Sun, 10 Mar 2019 22:01:21 +0800 Subject: [PATCH v23] During pg_upgrade, conditionally skip transfer of FSMs. If a heap on the old cluster has 4 pages or fewer, and the old cluster was PG v11 or earlier, don't copy or link the FSM. This will shrink space usage for installations with large numbers of small tables. --- doc/src/sgml/ref/pgupgrade.sgml | 7 src/bin/pg_upgrade/info.c| 16 +++-- src/bin/pg_upgrade/pg_upgrade.h | 6 src/bin/pg_upgrade/relfilenode.c | 58 +++- 4 files changed, 84 insertions(+), 3 deletions(-) diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml index 7e1afaf0a5..c896882dd1 100644 --- a/doc/src/sgml/ref/pgupgrade.sgml +++ b/doc/src/sgml/ref/pgupgrade.sgml @@ -792,6 +792,13 @@ psql --username=postgres --file=script.sql postgres is down. + + In PostgreSQL 12 and later small tables by + default don't have a free space map, as a space optimization. If you are + upgrading a pre-12 cluster, the free space maps of small tables will + likewise not be transferred to the new cluster. 
+ + diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c index 2f925f086c..902bfc647e 100644 --- a/src/bin/pg_upgrade/info.c +++ b/src/bin/pg_upgrade/info.c @@ -200,6 +200,8 @@ create_rel_filename_map(const char *old_data, const char *new_data, map->old_db_oid = old_db->db_oid; map->new_db_oid = new_db->db_oid; + map->relpages = old_rel->relpages; + map->relkind = old_rel->relkind; /* * old_relfilenode might differ from pg_class.oid (and hence @@ -418,6 +420,7 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo) char *nspname = NULL; char *relname = NULL; char *tablespace = NULL; + char *relkind = NULL; int i_spclocation, i_nspname, i_relname, @@ -425,7 +428,9 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo) i_indtable, i_toastheap, i_relfilenode, -i_reltablespace; +i_reltablespace, +i_relpages, +i_relkind; char query[QUERY_ALLOC]; char *last_namespace = NULL, *last_tablespace = NULL; @@ -494,7 +499,7 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo) */ snprintf(query + strlen(query), sizeof(query) - strlen(query), "SELECT all_rels.*, n.nspname, c.relname, " - " c.relfilenode, c.reltablespace, %s " + " c.relfilenode, c.reltablespace, c.relpages, c.relkind, %s " "FROM (SELECT * FROM regular_heap " " UNION ALL " " SELECT * FROM toast_heap " @@ -525,6 +530,8 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo) i_relname = PQfnumber(res, "relname"); i_relfilenode = PQfnumber(res, "relfilenode"); i_reltablespace = PQfnumber(res, "reltablespace"); + i_relpages = PQfnumber(res, "relpages&quo
Re: Why don't we have a small reserved OID range for patch revisions?
On Sat, Mar 9, 2019 at 1:14 AM Tom Lane wrote: > I took a quick look at this. I went ahead and pushed the parts that > were just code cleanup in reformat_dat_file.pl, since that seemed > pretty uncontroversial. As far as the rest of it goes: Okay, thanks. > * I'm really not terribly happy with sticking this functionality into > reformat_dat_file.pl. First, there's an issue of discoverability: > it's not obvious that a script named that way would have such an > ability. Second, it clutters the script in a way that seems to me > to hinder its usefulness as a basis for one-off hacks. So I'd really > rather have a separate script named something like "renumber_oids.pl", > even if there's a good deal of code duplication between it and > reformat_dat_file.pl. > * In my vision of what this might be good for, I think it's important > that it be possible to specify a range of input OIDs to renumber, not > just "everything above N". I agree the output range only needs a > starting OID. Now it looks like: perl renumber_oids.pl --first-mapped-oid 8000 --last-mapped-oid 8999 --first-target-oid 2000 *.dat To prevent a maintenance headache, I didn't copy any of the formatting logic over. You'll also have to run reformat_dat_files.pl afterwards to restore that. It seems to work, but I haven't tested thoroughly. -- John Naylor https://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services From c6f5a1f6016feeb7f7ea94e504be6d23a264ca07 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Mon, 11 Mar 2019 01:05:15 +0800 Subject: [PATCH v2] Provide script to renumber OIDs. Historically, it's been considered good practice to choose an OID that's at the beginning of the range shown by the unused_oids script, but nowadays that space is getting cramped, leading to merge conflicts. Instead, allow developers to use high-numbered OIDs, and remap them to a lower range at some future date.
--- doc/src/sgml/bki.sgml| 16 +++ src/include/catalog/renumber_oids.pl | 193 +++ 2 files changed, 209 insertions(+) create mode 100644 src/include/catalog/renumber_oids.pl diff --git a/doc/src/sgml/bki.sgml b/doc/src/sgml/bki.sgml index 3c5830bc8a..7e553dc30d 100644 --- a/doc/src/sgml/bki.sgml +++ b/doc/src/sgml/bki.sgml @@ -392,6 +392,22 @@ at compile time.) + +For a variety of reasons, newly assigned OIDs should occupy the lower +end of the reserved range. Because multiple patches considered for +inclusion into PostgreSQL could be using +the same new OIDs, there is the possibility of conflict. To mitigate +this, there is a convention that OIDs 8000 and over are reserved for +development. This reduces the effort in rebasing over the HEAD branch. +To enable committers to move these OIDs to the lower range easily, +renumber_oids.pl can map OIDs as in the following, +which maps any OIDs between 8000 and 8999, inclusive, to +unused OIDs starting at 2000: + +$ perl renumber_oids.pl --first-mapped-oid 8000 --last-mapped-oid 8999 --first-target-oid 2000 *.dat + + + The OID counter starts at 1 at the beginning of a bootstrap run. If a row from a source other than postgres.bki diff --git a/src/include/catalog/renumber_oids.pl b/src/include/catalog/renumber_oids.pl new file mode 100644 index 00..79537465bc --- /dev/null +++ b/src/include/catalog/renumber_oids.pl @@ -0,0 +1,193 @@ +#!/usr/bin/perl +#-- +# +# renumber_oids.pl +#Perl script that shifts a range of high-numbered OIDs in the given +#catalog data file(s) to a lower range. +# +#Note: This does not format the output, so you will still need to +#run reformat_dat_file.pl.
+# +# Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California +# +# src/include/catalog/renumber_oids.pl +# +#-- + +use strict; +use warnings; + +use FindBin; +use Getopt::Long; +use Data::Dumper; +$Data::Dumper::Terse = 1; + +# If you copy this script to somewhere other than src/include/catalog, +# you'll need to modify this "use lib" or provide a suitable -I switch. +use lib "$FindBin::RealBin/../../backend/catalog/"; +use Catalog; + +# Must run in src/include/catalog +chdir $FindBin::RealBin or die "could not cd to $FindBin::RealBin: $!\n"; + +my $FirstGenbkiObjectId = + Catalog::FindDefinedSymbol('access/transam.h', '..', + 'FirstGenbkiObjectId'); + +# Process command line switches. +my $output_path = ''; +my $first_mapped_oid; +my $last_mapped_oid = $FirstGenbkiObjectId - 1; +my $first_target_oid = 1; + +GetOptions( + 'output:s' => \$output_path, + 'first-m
Re: WIP: Avoid creation of the free space map for small tables
On Fri, Mar 8, 2019 at 7:43 PM Amit Kapila wrote: > Have you done any performance testing of this patch? I mean to say > now that we added a new stat call for each table, we should see if > that has any impact. Ideally, that should be compensated by the fact > that we are now not transferring *fsm files for small relations. To be precise, it will only call stat if pg_class.relpages is below the threshold. I suppose I could hack a database where all the relpages values are wrong, but that seems like a waste of time. > How > about constructing a test where all relations are greater than 4 pages > and then try to upgrade them. We can check for a cluster with a > different number of relations say 10K, 20K, 50K, 100K. I did both greater and less than 4 pages for 10k relations. Since pg_upgrade is O(# relations), I don't see a point in going higher. First, I had a problem: On MacOS with their "gcc" wrapper around clang, I got a segfault 11 when compiled with no debugging symbols. I added "CFLAGS=-O0" and it worked fine. Since this doesn't happen in a debugging build, I'm not sure how to investigate this. IIRC, this doesn't happen for me on Linux gcc. Since it was at least running now, I measured by putting gettimeofday() calls around transfer_all_new_tablespaces(). I did 10 runs each and took the average, except for patch/1-page case since it was obviously faster after a couple runs. 5 pages: master 5.59s / patch 5.64s The variation within the builds is up to +/- 0.2s, so there is no difference, as expected. 1 page: master 5.62s / patch 4.25s Clearly, linking is much slower than stat. -- John Naylor https://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: WIP: Avoid creation of the free space map for small tables
On Wed, Mar 13, 2019 at 8:18 PM Amit Kapila wrote: > > First, I had a problem: On MacOS with their "gcc" wrapper around > > clang, I got a segfault 11 when compiled with no debugging symbols. > > > > Did you get this problem with the patch or both with and without the > patch? If it is only with patch, then we definitely need to > investigate. Only with the patch. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: jsonpath
On Sun, Feb 24, 2019 at 5:03 PM Alexander Korotkov wrote: > On Wed, Jan 30, 2019 at 5:28 AM Andres Freund wrote: > > Why -CF, and why is -p repeated? > > BTW, for our SQL grammar we have > > > scan.c: FLEXFLAGS = -CF -p -p > > Is it kind of default? I just saw this in the committed patch. This is not default, it's chosen for maximum performance at the expense of binary/memory size. That's fine, but with a little effort you can also make the scanner non-backtracking for additional performance, as in the attached. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services jsonpath-scan-nobackup.patch Description: Binary data
Re: jsonpath
On Thu, Mar 21, 2019 at 9:59 PM Alexander Korotkov wrote: > Attaches patches improving jsonpath parser. First one introduces > cosmetic changes, while second gets rid of backtracking. I'm also > planning to add high-level comment for both grammar and lexer. The cosmetic changes look good to me. I just noticed a couple things about the comments. 0001: +/* Check if current scanstring value constitutes a keyword */ 'is a keyword' is better. 'Constitutes' implies parts of a whole. + * Resize scanstring for appending of given length. Reinitilize if required. s/Reinitilize/Reinitialize/ The first sentence is not entirely clear to me. 0002: These two rules are not strictly necessary: {unicode}+\\ { /* throw back the \\, and treat as unicode */ yyless(yyleng - 1); parseUnicode(yytext, yyleng); } {hex_char}+\\ { /* throw back the \\, and treat as hex */ yyless(yyleng - 1); parseHexChars(yytext, yyleng); } ...and only seem to be there because of how these are written: {unicode}+ { parseUnicode(yytext, yyleng); } {hex_char}+ { parseHexChars(yytext, yyleng); } {unicode}*{unicodefail} { yyerror(NULL, "Unicode sequence is invalid"); } {hex_char}*{hex_fail} { yyerror(NULL, "Hex character sequence is invalid"); } I don't understand the reasoning here -- is it a micro-optimization? The following is simpler, allow the rules I mentioned to be removed, and make check still passes. I would prefer it unless there is a performance penalty, in which case a comment to describe the additional complexity would be helpful. {unicode} { parseUnicode(yytext, yyleng); } {hex_char} { parseHexChars(yytext, yyleng); } {unicodefail} { yyerror(NULL, "Unicode sequence is invalid"); } {hex_fail} { yyerror(NULL, "Hex character sequence is invalid"); } -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: use Getopt::Long for catalog scripts
On Thu, Feb 7, 2019 at 6:53 PM David Fetter wrote: > [some style suggestions] I've included these in v2, and also some additional tweaks for consistency. -- John Naylorhttps://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services From 0bf2b3ffdaa16d85856074dda8e15c09fc99d0fe Mon Sep 17 00:00:00 2001 From: John Naylor Date: Thu, 7 Feb 2019 20:10:50 +0100 Subject: [PATCH v2] Use Getopt::Long for catalog scripts Replace hand-rolled option parsing with the Getopt module. This is shorter and easier to read. In passing, make some cosmetic adjustments for consistency. --- src/backend/catalog/Makefile | 2 +- src/backend/catalog/genbki.pl| 49 +--- src/backend/utils/Gen_fmgrtab.pl | 38 - src/backend/utils/Makefile | 2 +- 4 files changed, 28 insertions(+), 63 deletions(-) diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index d9f92ba701..b06ee49ca8 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -89,7 +89,7 @@ generated-header-symlinks: $(top_builddir)/src/include/catalog/header-stamp # version number when it changes. bki-stamp: genbki.pl Catalog.pm $(POSTGRES_BKI_SRCS) $(POSTGRES_BKI_DATA) $(top_srcdir)/configure.in $(PERL) -I $(catalogdir) $< \ - -I $(top_srcdir)/src/include/ --set-version=$(MAJORVERSION) \ + --include-path=$(top_srcdir)/src/include/ --set-version=$(MAJORVERSION) \ $(POSTGRES_BKI_SRCS) touch $@ diff --git a/src/backend/catalog/genbki.pl b/src/backend/catalog/genbki.pl index be81094ffb..4935e00fb2 100644 --- a/src/backend/catalog/genbki.pl +++ b/src/backend/catalog/genbki.pl @@ -16,6 +16,7 @@ use strict; use warnings; +use Getopt::Long; use File::Basename; use File::Spec; @@ -23,43 +24,21 @@ BEGIN { use lib File::Spec->rel2abs(dirname(__FILE__)); } use Catalog; -my @input_files; my $output_path = ''; my $major_version; my $include_path; -# Process command line switches. 
-while (@ARGV) -{ - my $arg = shift @ARGV; - if ($arg !~ /^-/) - { - push @input_files, $arg; - } - elsif ($arg =~ /^-I/) - { - $include_path = length($arg) > 2 ? substr($arg, 2) : shift @ARGV; - } - elsif ($arg =~ /^-o/) - { - $output_path = length($arg) > 2 ? substr($arg, 2) : shift @ARGV; - } - elsif ($arg =~ /^--set-version=(.*)$/) - { - $major_version = $1; - die "Invalid version string.\n" - if !($major_version =~ /^\d+$/); - } - else - { - usage(); - } -} +GetOptions( + 'output:s' => \$output_path, + 'set-version:s' => \$major_version, + 'include-path:s' => \$include_path) || usage(); # Sanity check arguments. -die "No input files.\n" if !@input_files; -die "--set-version must be specified.\n" if !defined $major_version; -die "-I, the header include path, must be specified.\n" if !$include_path; +die "No input files.\n" unless @ARGV; +die "--set-version must be specified.\n" unless $major_version; +die "Invalid version string: $major_version\n" + unless $major_version =~ /^\d+$/; +die "--include-path must be specified.\n" unless $include_path; # Make sure paths end with a slash. if ($output_path ne '' && substr($output_path, -1) ne '/') @@ -79,7 +58,7 @@ my @toast_decls; my @index_decls; my %oidcounts; -foreach my $header (@input_files) +foreach my $header (@ARGV) { $header =~ /(.+)\.h$/ or die "Input files need to be header files.\n"; @@ -917,12 +896,12 @@ sub form_pg_type_symbol sub usage { die <] [--include-path/-i ] header... 
Options: --I include path --o output path +--output Output directory (default '.') --set-versionPostgreSQL version number for initdb cross-check +--include-path Include path in source tree genbki.pl generates BKI files and symbol definition headers from specially formatted header files and .dat diff --git a/src/backend/utils/Gen_fmgrtab.pl b/src/backend/utils/Gen_fmgrtab.pl index f17a78ebcd..2ffacae666 100644 --- a/src/backend/utils/Gen_fmgrtab.pl +++ b/src/backend/utils/Gen_fmgrtab.pl @@ -18,32 +18,14 @@ use Catalog; use strict; use warnings; +use Getopt::Long; -# Collect arguments -my @input_files; my $output_path = ''; my $include_path; -while (@ARGV) -{ - my $arg = shift @ARGV; - if ($arg !~ /^-/) - { - push @input_files, $arg; - } - elsif ($arg =~ /^-o/) - { - $output_path = length($arg) > 2 ? substr($arg, 2) : shift @ARGV; - } - elsif ($arg =~ /^-I/) - { - $include_path = length($arg) > 2 ? substr($arg, 2) : shift @ARGV; - } - else - { - usage(); - } -} +GetOptions( + 'output:s' => \$output_path, + 'include-path:s' => \$include_path) || usage(); # Make sure output_path ends in a slash. if ($output_path ne '' && substr($output_path, -1) ne '/') @@ -52,8 +34,8 @@ if ($output_path ne '' && subs