Re: [HACKERS] [PATCH] Fix ScalarArrayOpExpr estimation for GIN indexes

2011-12-21 Thread Marti Raudsepp
On Wed, Dec 21, 2011 at 03:03, Tom Lane t...@sss.pgh.pa.us wrote:
 I've applied a revised version of this patch that factors things in a
 way I found nicer.

Nice, thanks!

Regards,
Marti

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums

2011-12-21 Thread Leonardo Francalanci

I can't help in this discussion, but I have a question:
how different would this feature be from filesystem-level CRC, such as 
the one available in ZFS and btrfs?



--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


[HACKERS] patch: bytea_agg

2011-12-21 Thread Pavel Stehule
Hello

this patch adds a bytea_agg aggregation.

It allows fast bytea concatenation.

Regards

Pavel Stehule
*** ./doc/src/sgml/func.sgml.orig	2011-12-07 11:04:33.0 +0100
--- ./doc/src/sgml/func.sgml	2011-12-21 11:00:18.255753111 +0100
***
*** 10911,10916 
--- 10911,10934 
   <row>
    <entry>
     <indexterm>
+      <primary>bytea_agg</primary>
+     </indexterm>
+     <function>
+       bytea_agg(<replaceable class="parameter">expression</replaceable>)
+     </function>
+    </entry>
+    <entry>
+     <type>bytea</type>
+    </entry>
+    <entry>
+     <type>bytea</type>
+    </entry>
+    <entry>input values concatenated into a bytea</entry>
+   </row>
+ 
+   <row>
+    <entry>
+     <indexterm>
      <primary>count</primary>
     </indexterm>
     <function>count(*)</function>
*** ./src/backend/utils/adt/varlena.c.orig	2011-12-21 08:21:10.0 +0100
--- ./src/backend/utils/adt/varlena.c	2011-12-21 10:46:33.344807038 +0100
***
*** 396,401 
--- 396,448 
  	PG_RETURN_BYTEA_P(vlena);
  }
  
+ Datum
+ bytea_agg_transfn(PG_FUNCTION_ARGS)
+ {
+ 	StringInfo	state;
+ 
+ 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
+ 
+ 	/* Append the value unless null. */
+ 	if (!PG_ARGISNULL(1))
+ 	{
+ 		bytea	   *value = PG_GETARG_BYTEA_PP(1);
+ 
+ 		if (state == NULL)
+ 			state = makeStringAggState(fcinfo);
+ 
+ 		appendBinaryStringInfo(state, VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value));
+ 	}
+ 
+ 	/*
+ 	 * The transition type for bytea_agg() is declared to be internal,
+ 	 * which is a pass-by-value type the same size as a pointer.
+ 	 */
+ 	PG_RETURN_POINTER(state);
+ }
+ 
+ Datum
+ bytea_agg_finalfn(PG_FUNCTION_ARGS)
+ {
+ 	StringInfo	state;
+ 
+ 	/* cannot be called directly because of internal-type argument */
+ 	Assert(AggCheckCallContext(fcinfo, NULL));
+ 
+ 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
+ 
+ 	if (state != NULL)
+ 	{
+ 		bytea	   *result;
+ 
+ 		result = (bytea *) palloc(state->len + VARHDRSZ);
+ 		SET_VARSIZE(result, state->len + VARHDRSZ);
+ 		memcpy(VARDATA(result), state->data, state->len);
+ 		PG_RETURN_BYTEA_P(result);
+ 	}
+ 	else
+ 		PG_RETURN_NULL();
+ }
  
  /*
   *		textin			- converts ... to internal representation
*** ./src/include/catalog/pg_aggregate.h.orig	2011-12-07 11:04:33.0 +0100
--- ./src/include/catalog/pg_aggregate.h	2011-12-21 10:28:37.016877356 +0100
***
*** 226,231 
--- 226,234 
  /* text */
  DATA(insert ( 3538	string_agg_transfn	string_agg_finalfn		0	2281	_null_ ));
  
+ /* bytea */
+ DATA(insert ( 3545	bytea_agg_transfn	bytea_agg_finalfn		0	2281	_null_ ));
+ 
  /*
   * prototypes for functions in pg_aggregate.c
   */
*** ./src/include/catalog/pg_proc.h.orig	2011-12-21 08:21:10.0 +0100
--- ./src/include/catalog/pg_proc.h	2011-12-21 10:25:29.533889614 +0100
***
*** 2403,2414 
--- 2403,2421 
DESCR("aggregate final function");
DATA(insert OID = 2817 (  float8_corr			PGNSP PGUID 12 1 0 0 0 f f f t f i 1 0 701 "1022" _null_ _null_ _null_ _null_ float8_corr _null_ _null_ _null_ ));
DESCR("aggregate final function");
+ 
DATA(insert OID = 3535 (  string_agg_transfn		PGNSP PGUID 12 1 0 0 0 f f f f f i 3 0 2281 "2281 25 25" _null_ _null_ _null_ _null_ string_agg_transfn _null_ _null_ _null_ ));
DESCR("aggregate transition function");
DATA(insert OID = 3536 (  string_agg_finalfn		PGNSP PGUID 12 1 0 0 0 f f f f f i 1 0 25 "2281" _null_ _null_ _null_ _null_ string_agg_finalfn _null_ _null_ _null_ ));
DESCR("aggregate final function");
DATA(insert OID = 3538 (  string_agg			PGNSP PGUID 12 1 0 0 0 t f f f f i 2 0 25 "25 25" _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("concatenate aggregate input into a string");
+ DATA(insert OID = 3543 (  bytea_agg_transfn		PGNSP PGUID 12 1 0 0 0 f f f f f i 2 0 2281 "2281 17" _null_ _null_ _null_ _null_ bytea_agg_transfn _null_ _null_ _null_ ));
+ DESCR("aggregate transition function");
+ DATA(insert OID = 3544 (  bytea_agg_finalfn		PGNSP PGUID 12 1 0 0 0 f f f f f i 1 0 17 "2281" _null_ _null_ _null_ _null_ bytea_agg_finalfn _null_ _null_ _null_ ));
+ DESCR("aggregate final function");
+ DATA(insert OID = 3545 (  bytea_agg				PGNSP PGUID 12 1 0 0 0 t f f f f i 1 0 17 "17" _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
+ DESCR("concatenate aggregate input into a bytea");
  
  /* To ASCII conversion */
  DATA(insert OID = 1845 ( to_ascii	PGNSP PGUID 12 1 0 0 0 f f f t f i 1 0 25 25 _null_ _null_ _null_ _null_	to_ascii_default _null_ _null_ _null_ ));
*** ./src/include/utils/builtins.h.orig	2011-12-21 08:21:10.0 +0100
--- ./src/include/utils/builtins.h	2011-12-21 10:16:10.521926024 +0100
***
*** 769,774 
--- 769,776 
  
  extern Datum pg_column_size(PG_FUNCTION_ARGS);
  
+ extern Datum bytea_agg_transfn(PG_FUNCTION_ARGS);
+ extern Datum bytea_agg_finalfn(PG_FUNCTION_ARGS);
  extern Datum 

[HACKERS] patch: very simply optimalization of string_agg

2011-12-21 Thread Pavel Stehule
Hello

This removes an unnecessary string-to-text conversion overhead.

Regards

Pavel Stehule
*** ./src/backend/utils/adt/varlena.c.orig	2011-12-21 10:46:33.0 +0100
--- ./src/backend/utils/adt/varlena.c	2011-12-21 11:08:42.583720151 +0100
***
*** 3668,3674 
  	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
  
  	if (state != NULL)
! 		PG_RETURN_TEXT_P(cstring_to_text(state->data));
  	else
  		PG_RETURN_NULL();
  }
--- 3668,3674 
  	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
  
  	if (state != NULL)
! 		PG_RETURN_TEXT_P(cstring_to_text_with_len(state->data, state->len));
  	else
  		PG_RETURN_NULL();
  }

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] CLOG contention

2011-12-21 Thread Simon Riggs
On Wed, Dec 21, 2011 at 5:33 AM, Tom Lane t...@sss.pgh.pa.us wrote:
 Robert Haas robertmh...@gmail.com writes:
 ... while the main buffer manager is
 content with some loosey-goosey approximation of recency, the SLRU
 code makes a fervent attempt at strict LRU (slightly compromised for
 the sake of reduced locking in SimpleLruReadPage_Readonly).

 Oh btw, I haven't looked at that code recently, but I have a nasty
 feeling that there are parts of it that assume that the number of
 buffers it is managing is fairly small.  Cranking up the number
 might require more work than just changing the value.

My memory was that you'd said benchmarks showed NUM_CLOG_BUFFERS needs
to be low enough to allow fast lookups, since the lookups don't use an
LRU they just scan all buffers. Indeed, it was your objection that
stopped NUM_CLOG_BUFFERS being increased many years before this.

With the increased performance we have now, I don't think increasing
that alone will be that useful since it doesn't solve all of the
problems and (I am told) likely reduces lookup speed.

The full list of clog problems I'm aware of is: raw lookup speed,
multi-user contention, writes at checkpoint and new xid allocation.

Would it be better just to have multiple SLRUs dedicated to the clog?
Simply partition things so we have 2^N sets of everything, and we look
up the xid in partition (xid % (2^N)).  That would overcome all of the
problems, not just lookup, in exactly the same way that we partitioned
the buffer and lock manager. We would use a graduated offset on the
page to avoid zeroing pages at the same time. Clog size wouldn't
increase, we'd have the same number of bits, just spread across 2^N
files. We'd have more pages too, but that's not a bad thing since it
spreads out the contention.

Code-wise, those changes would be isolated to clog.c only, probably a
days work if you like the idea.
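
To illustrate (purely a sketch, not from any posted patch; all names
below are invented), the xid-to-partition mapping could look roughly
like this, with each partition seeing only every 2^N'th xid so the
total number of status bits stays the same:

/*
 * Standalone sketch of the partitioned-clog lookup idea above.
 * Everything here is illustrative, not committed code.
 */
#include <stdio.h>
#include <stdint.h>

#define CLOG_PARTITIONS      8       /* 2^N independent SLRU sets */
#define CLOG_XACTS_PER_PAGE  32768   /* 8 kB page, 2 status bits per xact */

typedef uint32_t TransactionId;

/* Which of the 2^N clog SLRUs holds this xid's status. */
static int
clog_partition_for(TransactionId xid)
{
    return (int) (xid % CLOG_PARTITIONS);
}

/* Page number within that partition: it only sees every 2^N'th xid. */
static int
clog_page_for(TransactionId xid)
{
    return (int) ((xid / CLOG_PARTITIONS) / CLOG_XACTS_PER_PAGE);
}

int
main(void)
{
    TransactionId xid = 123456789;

    printf("xid %u -> partition %d, page %d\n",
           xid, clog_partition_for(xid), clog_page_for(xid));
    return 0;
}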

-- 
 Simon Riggs   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Pause at end of recovery

2011-12-21 Thread Magnus Hagander
On Tue, Dec 20, 2011 at 18:15, Simon Riggs si...@2ndquadrant.com wrote:
 On Tue, Dec 20, 2011 at 1:40 PM, Magnus Hagander mag...@hagander.net wrote:
 These days we have pause_at_recovery_target, which lets us pause when
 we reach a PITR target. Is there a particular reason we don't have a
 way to pause at end of recovery if we *didn't* specify a target -
 meaning we let it run until the end of the archived log? While it's
 too late to change the target, I can see a lot of usescases where you
 don't want it to be possible to make changes to the database again
 until it has been properly verified - and keeping it up in readonly
 mode in that case can be quite useful...

 Useful for what purpose? It's possible to deny access in other ways already.

For validating the restore, while allowing easy read-only access.

If you could declare a read-only connection in pg_hba.conf it would
give the same functionality, but you really can't...

I'm not saying it's a big feature. But the way it looks now it seems
to be artificially restricted from a usecase. Or is there a technical
reason why we don't allow it?

-- 
 Magnus Hagander
 Me: http://www.hagander.net/
 Work: http://www.redpill-linpro.com/

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] sorting table columns

2011-12-21 Thread Simon Riggs
On Tue, Dec 20, 2011 at 9:47 PM, Alvaro Herrera
alvhe...@commandprompt.com wrote:

  The idea described there by Tom, and upon which I formed a vague
  implementation plan in my head, is that I was to look for all uses of
  an attnum, and then replace it by either attlognum (i.e. the
  user-visible sort identifier) or attphysnum (i.e. the order of
  attributes as stored on disk).

 I thought we'd concluded that we really need three values: attnum should
 be a permanent logical ID for each column, and then the user-visible
 column order would be determined by a different number, and the on-disk
 column order by a third.  If we're going to do this at all, it seems
 like a seriously bad idea to only go halfway, because then we'll just
 have to revisit all the same code again later.

 Yeah, I was unclear -- that's what I'm doing (or, rather, attempting to
 do).

Sounds great.

While you're doing this, I'd like to think about future requirements,
to see if that changes anything.

Having a unique logical column id is a great thing because it allows
the physical storage to differ. This is the first part to allowing
these features...

* column-based storage where the data for some column(s) lives in a
dedicated heap

* vertical partitioning where defined groups of columns live in
separate heaps for performance and/or security

* generated columns where the column exists only logically and is
derived at run-time (per SQL Standard)

* key/value columns where we retrieve the column value from an hstore

* very large number of columns for statistical data sets where we
automatically vertically partition the heap when faced with large
numbers of column definitions

So when you store the physical representation please also store a
storage method, that currently has just one method SM_HEAP and a
relfilenode.
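
As a purely hypothetical illustration of what that extra per-column
physical information might look like (SM_HEAP and the field names are
invented here, not an actual catalog proposal):

#include <stdint.h>

/* Hypothetical storage-method tag; today there would be only one value. */
typedef enum StorageMethod
{
    SM_HEAP                     /* regular heap storage */
} StorageMethod;

/* Hypothetical per-column physical descriptor. */
typedef struct ColumnPhysical
{
    int16_t       attphysnum;   /* position in the on-disk tuple */
    StorageMethod storage;      /* which storage engine holds the column */
    uint32_t      relfilenode;  /* file holding it (would be an Oid) */
} ColumnPhysical;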

-- 
 Simon Riggs   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] sorting table columns

2011-12-21 Thread Alvaro Herrera

Excerpts from Simon Riggs's message of Wed Dec 21 09:44:04 -0300 2011:

 Sounds great.
 
 While you're doing this, I'd like to think about future requirements,
 to see if that changes anything.
 
 Having a unique logical column id is a great thing because it allows
 the physical storage to differ. This is the first part to allowing
 these features...

Great ideas.  This one I'm not sure about at all:

 * very large number of columns for statistical data sets where we
 automatically vertically partition the heap when faced with large
 numbers of column definitions
 
 So when you store the physical representation please also store a
 storage method, that currently has just one method SM_HEAP and a
 relfilenode.

Well, for the patch I'm working on right now, I'm just going to store an
ID as physical representation, which will mean the sort order used for
the on-disk representation of our current heap storage; the idea here is
to allow columns to be sorted internally by the system so that alignment
padding is reduced; nothing more.  Of course, we can work on more
complex representations later that allow different storage strategies,
such as the ones you propose.

-- 
Álvaro Herrera alvhe...@commandprompt.com
The PostgreSQL Company - Command Prompt, Inc.
PostgreSQL Replication, Consulting, Custom Development, 24x7 support

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Extensions and 9.2

2011-12-21 Thread Robert Haas
On Tue, Dec 20, 2011 at 10:01 AM, Dimitri Fontaine
dimi...@2ndquadrant.fr wrote:
 Either I develop them separately, with separate branches derived from
 the master one, or I develop them as a stack, one on top of the other.
 The difference is my ability to provide a patch for one of the features
 that can be applied to master directly compared to how much time I have
 to spend cooking one patch or the other (merge conflicts, etc).

Personally, I hate patches that do more than one thing.  For me, the
time required to verify a patch goes as about O(n^2) in its size.
Furthermore, putting more than one feature into a patch means that it
has to be rejected (or revised by the committer) if any one of those
features looks half-baked.  I can't speak to the preferences of any
other contributor.

  - extension whitelisting

   the goal here is to allow non-superusers to install extensions from a
   restricted list, introducing a specific “sudo”-like behavior when the
   extension is implemented in C or some other untrusted language.

Who creates this list?

If the answer is the superuser, then why not just let them create a
suitable SECURITY DEFINER function if they are so inclined, wrapping
CREATE EXTENSION?  We've occasionally had requests for DDL
permissions so that you could, for example, grant a given user the
right to ANALYZE a table (but nothing else).  But it's not entirely
clear to me that it's worth doing that.  Assuming the command in
question can be stuffed inside a function, the most you're gaining is
a little notational convenience, and I'm not convinced it's worth
building the amount of infrastructure that this will require for that.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] patch: very simply optimalization of string_agg

2011-12-21 Thread Robert Haas
On Wed, Dec 21, 2011 at 5:12 AM, Pavel Stehule pavel.steh...@gmail.com wrote:
 This removes an unnecessary string-to-text conversion overhead

Committed, thanks.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Extensions and 9.2

2011-12-21 Thread Dimitri Fontaine
Robert Haas robertmh...@gmail.com writes:
 Personally, I hate patches that do more than one thing.  For me, the
 time required to verify a patch goes as about O(n^2) in its size.

That's exactly why I'm opening that discussion.  The main difference
between the approaches I can take is the time it takes to export each
patch against the merge conflicts to solve at each minor revision.

  - extension whitelisting

 Who creates this list?

 If the answer is the superuser, then why not just let them create a

Yes.

 suitable SECURITY DEFINER function if they are so inclined, wrapping
 CREATE EXTENSION?  We've occasionally had requests for DDL

The goal is that users don't know about the whitelisting in most cases,
they just do CREATE EXTENSION and don't have to care about it, which
means it works the same on the laptop and the production environment.

That's what you easily can get with the command trigger patch.
-- 
Dimitri Fontaine
http://2ndQuadrant.fr PostgreSQL : Expertise, Formation et Support

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] deferrable triggers

2011-12-21 Thread Kevin Grittner
Tom Lane t...@sss.pgh.pa.us wrote:
 
 However, it's not a commit time trigger exactly -- keep in mind
 that SET CONSTRAINTS can override the trigger's own timing
 specification.
 
Hmm.  Is there a way for trigger code to check whether it is running
deferred (at transaction commit time) versus immediate?  (If not, I
see writing a small patch in my future to allow it.)
 
-Kevin

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums

2011-12-21 Thread Stephen Frost
* Leonardo Francalanci (m_li...@yahoo.it) wrote:
 I can't help in this discussion, but I have a question:
 how different would this feature be from filesystem-level CRC, such
 as the one available in ZFS and btrfs?

Depends on how much you trust the filesystem. :)

Stephen




Re: [HACKERS] Page Checksums

2011-12-21 Thread Kevin Grittner
Greg Smith g...@2ndquadrant.com wrote:
  Some people think I border on the paranoid on this issue.
 
 Those people are also out to get you, just like the hardware.
 
Hah!  I *knew* it!
 
 Are you arguing that autovacuum should be disabled after crash
 recovery?  I guess if you are arguing that a database VACUUM
 might destroy recoverable data when hardware starts to fail, I
 can't argue.
 
 A CRC failure suggests to me a significantly higher possibility
 of hardware likely to lead to more corruption than a normal crash
 does though.
 
Yeah, the discussion has me coming around to the point of view
advocated by Andres: that it should be treated the same as corrupt
pages detected through other means.  But that can only be done if
you eliminate false positives from hint-bit-only updates.  Without
some way to handle that, I guess that means the idea is dead.
 
Also, I'm not sure that our shop would want to dedicate any space
per page for this, since we're comparing between databases to ensure
that values actually match, row by row, during idle time.  A CRC or
checksum is a lot weaker than that.  I can see where it would be
very valuable where more rigorous methods aren't in use; but it
would really be just extra overhead with little or no benefit for
most of our database clusters.
 
-Kevin

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] CLOG contention

2011-12-21 Thread Robert Haas
On Wed, Dec 21, 2011 at 12:33 AM, Tom Lane t...@sss.pgh.pa.us wrote:
 Oh btw, I haven't looked at that code recently, but I have a nasty
 feeling that there are parts of it that assume that the number of
 buffers it is managing is fairly small.  Cranking up the number
 might require more work than just changing the value.

Oh, you mean like the fact that it tries to do strict LRU page
replacement?  *rolls eyes*  We seem to have named the SLRU system
after one of its scalability limitations...

I think there probably are some scalability limits to the current
implementation, but also I think we could probably increase the
current value modestly with something less than a total rewrite.
Linearly scanning the slot array won't scale indefinitely, but I think
it will scale to more than 8 elements.  The performance results I
posted previously make it clear that 8 -> 32 is a net win at least on
that system.  One fairly low-impact option might be to make the cache
less than fully associative - e.g. given N buffers, a page with pageno
% 4 == X is only allowed to be in a slot numbered between (N/4)*X and
(N/4)*(X+1)-1.  That likely would be counterproductive at N = 8 but
might be OK at larger values.  We could also switch to using a hash
table but that seems awfully heavy-weight.
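
To be concrete, here is a standalone sketch of that slot-restriction
rule (illustration only, not actual slru.c code):

#include <stdio.h>

/*
 * "Less than fully associative" idea: nbuffers slots are split into 4
 * sets, and a page may only live in the contiguous slot range belonging
 * to its set (pageno % 4).
 */
static void
slot_range_for_page(int pageno, int nbuffers, int *first, int *last)
{
    int     set = pageno % 4;           /* which of the 4 sets */
    int     slots_per_set = nbuffers / 4;

    *first = slots_per_set * set;
    *last = slots_per_set * (set + 1) - 1;
}

int
main(void)
{
    int     first, last;

    /* with 32 buffers, page 10 (10 % 4 == 2) may only use slots 16..23 */
    slot_range_for_page(10, 32, &first, &last);
    printf("page 10 -> slots %d..%d\n", first, last);
    return 0;
}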

The real question is how to decide how many buffers to create.  You
suggested a formula based on shared_buffers, but what would that
formula be?  I mean, a typical large system is going to have 1,048,576
shared buffers, and it probably needs less than 0.1% of that amount of
CLOG buffers.  My guess is that there's no real reason to skimp: if
you are really tight for memory, you might want to crank this down,
but otherwise you may as well just go with whatever we decide the
best-performing value is.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] CLOG contention

2011-12-21 Thread Robert Haas
On Wed, Dec 21, 2011 at 5:17 AM, Simon Riggs si...@2ndquadrant.com wrote:
 With the increased performance we have now, I don't think increasing
 that alone will be that useful since it doesn't solve all of the
 problems and (I am told) likely increases lookup speed.

I have benchmarks showing that it works, for whatever that's worth.

 The full list of clog problems I'm aware of is: raw lookup speed,
 multi-user contention, writes at checkpoint and new xid allocation.

What is the best workload to show a bottleneck on raw lookup speed?

I wouldn't expect writes at checkpoint to be a big problem because
it's so little data.

What's the problem with new XID allocation?

 Would it be better just to have multiple SLRUs dedicated to the clog?
 Simply partition things so we have 2^N sets of everything, and we look
 up the xid in partition (xid % (2^N)).  That would overcome all of the
 problems, not just lookup, in exactly the same way that we partitioned
 the buffer and lock manager. We would use a graduated offset on the
 page to avoid zeroing pages at the same time. Clog size wouldn't
 increase, we'd have the same number of bits, just spread across 2^N
 files. We'd have more pages too, but that's not a bad thing since it
 spreads out the contention.

It seems that would increase memory requirements (clog1 through clog4
with 2 pages each doesn't sound workable).  It would also break
on-disk compatibility for pg_upgrade.  I'm still holding out hope that
we can find a simpler solution...

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums

2011-12-21 Thread Andres Freund
On Wednesday, December 21, 2011 04:21:53 PM Kevin Grittner wrote:
 Greg Smith g...@2ndquadrant.com wrote:
   Some people think I border on the paranoid on this issue.
  
  Those people are also out to get you, just like the hardware.
 
 Hah!  I *knew* it!
 
  Are you arguing that autovacuum should be disabled after crash
  recovery?  I guess if you are arguing that a database VACUUM
  might destroy recoverable data when hardware starts to fail, I
  can't argue.
  
  A CRC failure suggests to me a significantly higher possibility
  of hardware likely to lead to more corruption than a normal crash
  does though.
 
 Yeah, the discussion has me coming around to the point of view
 advocated by Andres: that it should be treated the same as corrupt
 pages detected through other means.  But that can only be done if
 you eliminate false positives from hint-bit-only updates.  Without
 some way to handle that, I guess that means the idea is dead.
 
 Also, I'm not sure that our shop would want to dedicate any space
 per page for this, since we're comparing between databases to ensure
 that values actually match, row by row, during idle time.  A CRC or
 checksum is a lot weaker than that.  I can see where it would be
 very valuable where more rigorous methods aren't in use; but it
 would really be just extra overhead with little or no benefit for
 most of our database clusters.
Comparing between databases will by far not catch failures in all data,
because you surely will not use all indexes. With index-only scans the
likelihood of unnoticed heap corruption also increases.
E.g. I have seen disk-level corruption silently corrupt a unique index so
that it didn't cover all data anymore, which led to rather big problems.
Not everyone can do regular dump+restore tests to protect against such
scenarios...

Andres

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums

2011-12-21 Thread Leonardo Francalanci

On 21/12/2011 16.19, Stephen Frost wrote:

* Leonardo Francalanci (m_li...@yahoo.it) wrote:

I can't help in this discussion, but I have a question:
how different would this feature be from filesystem-level CRC, such
as the one available in ZFS and btrfs?


Depends on how much you trust the filesystem. :)



Ehm I hope that was a joke...


I think what I meant was: isn't this going to be useless in a couple of 
years (if, say, btrfs will be available)? Or it actually gives something 
that FS will never be able to give?


--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums

2011-12-21 Thread Heikki Linnakangas

On 21.12.2011 17:21, Kevin Grittner wrote:

Also, I'm not sure that our shop would want to dedicate any space
per page for this, since we're comparing between databases to ensure
that values actually match, row by row, during idle time.


4 bytes out of an 8k block is just under 0.05%. I don't think anyone is 
going to notice the extra disk space consumed by this. There are all those 
other issues like the hint bits that make this a non-starter, but disk 
space overhead is not one of them.


IMHO we should just advise that you should use a filesystem with CRCs if 
you want that extra level of safety. It's the hardware's and operating 
system's job to ensure that data doesn't get corrupt after we hand it 
over to the OS with write()/fsync().


--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums

2011-12-21 Thread Robert Haas
On Tue, Dec 20, 2011 at 12:12 PM, Christopher Browne cbbro...@gmail.com wrote:
 This seems to be a frequent problem with this whole doing CRCs on pages 
 thing.

 It's not evident which problems will be real ones.

That depends on the implementation.  If we have a flaky, broken
implementation such as the one proposed, then, yes, it will be
unclear.  But if we properly guard against a torn page invalidating
the CRC, then it won't be unclear at all: any CRC mismatch means
something bad happened.

Of course, that may be fairly expensive in terms of performance.  But
the only way I can see to get around that problem is to rewrite our
heap AM or our MVCC implementation in some fashion that gets rid of
hint bits.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums

2011-12-21 Thread Stephen Frost
* Leonardo Francalanci (m_li...@yahoo.it) wrote:
 Depends on how much you trust the filesystem. :)
 
 Ehm I hope that was a joke...

It certainly wasn't..

 I think what I meant was: isn't this going to be useless in a couple
 of years (if, say, btrfs will be available)? Or it actually gives
 something that FS will never be able to give?

Yes, it will help you find/address bugs in the filesystem.  These things
are not unheard of...

Thanks,

Stephen




Re: [HACKERS] CLOG contention

2011-12-21 Thread Kevin Grittner
Robert Haas robertmh...@gmail.com wrote:
 
 Any thoughts on what makes most sense here?  I find it fairly
 tempting to just crank up NUM_CLOG_BUFFERS and call it good,
 
The only thought I have to add to discussion so far is that the need
to do anything may be reduced significantly by any work to write
hint bits more aggressively.  We only consult CLOG for tuples on
which hint bits have not yet been set, right?  What if, before
writing a page, we try to set hint bits where we can?  When
successful, it would not only prevent one or more later writes of
the page, but could also prevent having to load old CLOG pages.
Perhaps the hint bit issue should be addressed first, and *then* we
check whether we still have a problem with CLOG.
 
-Kevin

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Allow substitute allocators for PGresult.

2011-12-21 Thread Robert Haas
On Thu, Dec 8, 2011 at 5:41 AM, Kyotaro HORIGUCHI
horiguchi.kyot...@oss.ntt.co.jp wrote:
  This is the patch to add the documentation of PGresult custom
  storage. It shows in section '31.19. Alternative result
  storage'.

It would be good to consolidate this into the main patch.

I find the names of the functions added here to be quite confusing and
would suggest renaming them.  I expected PQgetAsCstring to do
something similar to PQgetvalue, but the code is completely different,
and even after reading the documentation I still don't understand what
that function is supposed to be used for.  Why "as cstring"?  What
would the other option be?

I also don't think the "add tuple" terminology is particularly good.
It's not obvious from the name that what you're doing is overriding
the way memory is allocated and results are stored.

Also, what about the problem Tom mentioned here?

http://archives.postgresql.org/message-id/1042.1321123...@sss.pgh.pa.us

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] CLOG contention

2011-12-21 Thread Robert Haas
On Wed, Dec 21, 2011 at 10:51 AM, Kevin Grittner
kevin.gritt...@wicourts.gov wrote:
 Robert Haas robertmh...@gmail.com wrote:
 Any thoughts on what makes most sense here?  I find it fairly
 tempting to just crank up NUM_CLOG_BUFFERS and call it good,

 The only thought I have to add to discussion so far is that the need
 to do anything may be reduced significantly by any work to write
 hint bits more aggressively.  We only consult CLOG for tuples on
 which hint bits have not yet been set, right?  What if, before
 writing a page, we try to set hint bits where we can?  When
 successful, it would not only prevent one or more later writes of
 the page, but could also prevent having to load old CLOG pages.
 Perhaps the hint bit issue should be addressed first, and *then* we
 check whether we still have a problem with CLOG.

There may be workloads where that will help, but it's definitely not
going to cover all cases.  Consider my trusty
pgbench-at-scale-factor-100 test case: since the working set fits
inside shared buffers, we're only writing pages at checkpoint time.
The contention happens because we randomly select rows from the table,
and whatever row we select hasn't been examined since it was last
updated, and so it's unhinted.  But we're not reading the page in:
it's already in shared buffers, and has never been written out.  I
don't see any realistic way to avoid the CLOG lookups in that case:
nobody else has had any reason to touch that page in any way since the
tuple was first written.

So I think we need a more general solution.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


[HACKERS] [PATCH] Fix float8 parsing of denormal values (on some platforms?)

2011-12-21 Thread Marti Raudsepp
Hi list,

Back in June we had a discussion about parsing denormal floating-point
values. A float8->text conversion could result in a number that can't
be converted back to float8 anymore for some values. Among other
things, this could break backups (though my searches didn't turn up
any reports of this ever happening).

The reason is that Linux strtod() sets errno=ERANGE for denormal
numbers. This behavior is also explicitly allowed
(implementation-defined) by the C99 standard.

Further analysis was done by Jeroen Vermeulen here:
http://archives.postgresql.org/pgsql-hackers/2011-06/msg01773.php

I think the least invasive fix, as proposed by Jeroen, is to fail only
when ERANGE is set *and* the return value is 0.0 or +/-HUGE_VAL.
Reading relevant specifications, this seems to be a fairly safe
assumption. That's what the attached patch does.

(I also updated the float4in function, but that's not strictly
necessary -- it would fail later in CHECKFLOATVAL() anyway)

What I don't know is how many platforms are actually capable of
parsing denormal double values. I added some regression tests, it
would be interesting to see results from pgbuildfarm and potentially
revert these tests later.

Regards,
Marti

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] CLOG contention

2011-12-21 Thread Alvaro Herrera

Excerpts from Robert Haas's message of Wed Dec 21 13:18:36 -0300 2011:

 There may be workloads where that will help, but it's definitely not
 going to cover all cases.  Consider my trusty
 pgbench-at-scale-factor-100 test case: since the working set fits
 inside shared buffers, we're only writing pages at checkpoint time.
 The contention happens because we randomly select rows from the table,
 and whatever row we select hasn't been examined since it was last
 updated, and so it's unhinted.  But we're not reading the page in:
 it's already in shared buffers, and has never been written out.  I
 don't see any realistic way to avoid the CLOG lookups in that case:
 nobody else has had any reason to touch that page in any way since the
 tuple was first written.

Maybe we need a background tuple hinter process ...

-- 
Álvaro Herrera alvhe...@commandprompt.com
The PostgreSQL Company - Command Prompt, Inc.
PostgreSQL Replication, Consulting, Custom Development, 24x7 support

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [PATCH] Fix float8 parsing of denormal values (on some platforms?)

2011-12-21 Thread Marti Raudsepp
On Wed, Dec 21, 2011 at 18:21, Marti Raudsepp ma...@juffo.org wrote:
 I think the least invasive fix, as proposed by Jeroen, is to fail only
 when ERANGE is set *and* the return value is 0.0 or +/-HUGE_VAL.
 Reading relevant specifications, this seems to be a fairly safe
 assumption. That's what the attached patch does.

Oops, now attached the patch too.

Regards,
Marti
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
new file mode 100644
index 63b09a4..86e1661
*** a/src/backend/utils/adt/float.c
--- b/src/backend/utils/adt/float.c
*** float4in(PG_FUNCTION_ARGS)
*** 238,247 
  			endptr = num + 9;
  		}
  		else if (errno == ERANGE)
! 			ereport(ERROR,
! 	(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
! 	 errmsg("\"%s\" is out of range for type real",
! 			orig_num)));
  		else
  			ereport(ERROR,
  	(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
--- 238,257 
  			endptr = num + 9;
  		}
  		else if (errno == ERANGE)
! 		{
! 			/*
! 			 * We only fail for ERANGE if the return value is also out of
! 			 * range. Some platforms parse and return denormal values
! 			 * correctly, but still set errno to ERANGE.
! 			 */
! 			if (val == 0.0 || val == HUGE_VAL || val == -HUGE_VAL)
! 			{
! ereport(ERROR,
! 		(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
! 		 errmsg("\"%s\" is out of range for type real",
! orig_num)));
! 			}
! 		}
  		else
  			ereport(ERROR,
  	(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
*** float8in(PG_FUNCTION_ARGS)
*** 431,440 
  			endptr = num + 9;
  		}
  		else if (errno == ERANGE)
! 			ereport(ERROR,
! 	(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
!    errmsg("\"%s\" is out of range for type double precision",
! 		  orig_num)));
  		else
  			ereport(ERROR,
  	(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
--- 441,460 
  			endptr = num + 9;
  		}
  		else if (errno == ERANGE)
! 		{
! 			/*
! 			 * We only fail for ERANGE if the return value is also out of
! 			 * range. Some platforms parse and return denormal values
! 			 * correctly, but still set errno to ERANGE.
! 			 */
! 			if (val == 0.0 || val == HUGE_VAL || val == -HUGE_VAL)
! 			{
! ereport(ERROR,
! 		(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
! 	   errmsg("\"%s\" is out of range for type double precision",
! 			  orig_num)));
! 			}
! 		}
  		else
  			ereport(ERROR,
  	(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
diff --git a/src/test/regress/expected/float8.out b/src/test/regress/expected/float8.out
new file mode 100644
index 6221538..e3fb8d3
*** a/src/test/regress/expected/float8.out
--- b/src/test/regress/expected/float8.out
*** SELECT '-10e-400'::float8;
*** 24,29 
--- 24,48 
  ERROR:  "-10e-400" is out of range for type double precision
  LINE 1: SELECT '-10e-400'::float8;
 ^
+ -- test denormal value parsing
+ SELECT '4.95e-324'::float8 < '1.49e-323'::float8;
+  ?column? 
+ ----------
+  t
+ (1 row)
+ 
+ SELECT '4.95e-324'::float8 > '0'::float8;
+  ?column? 
+ ----------
+  t
+ (1 row)
+ 
+ SELECT substr('-4.95e-324'::float8::text, 1, 2);
+  substr 
+ --------
+  -4
+ (1 row)
+ 
  -- bad input
  INSERT INTO FLOAT8_TBL(f1) VALUES ('');
  ERROR:  invalid input syntax for type double precision: ""
diff --git a/src/test/regress/sql/float8.sql b/src/test/regress/sql/float8.sql
new file mode 100644
index 92a574a..e13eb51
*** a/src/test/regress/sql/float8.sql
--- b/src/test/regress/sql/float8.sql
*** SELECT '-10e400'::float8;
*** 16,21 
--- 16,26 
  SELECT '10e-400'::float8;
  SELECT '-10e-400'::float8;
  
+ -- test denormal value parsing
+ SELECT '4.95e-324'::float8 < '1.49e-323'::float8;
+ SELECT '4.95e-324'::float8 > '0'::float8;
+ SELECT substr('-4.95e-324'::float8::text, 1, 2);
+ 
  -- bad input
  INSERT INTO FLOAT8_TBL(f1) VALUES ('');
  INSERT INTO FLOAT8_TBL(f1) VALUES (' ');

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums

2011-12-21 Thread Leonardo Francalanci

I think what I meant was: isn't this going to be useless in a couple
of years (if, say, btrfs will be available)? Or it actually gives
something that FS will never be able to give?


Yes, it will help you find/address bugs in the filesystem.  These things
are not unheard of...


It sounds to me like a huge job to fix some issues not unheard of...

My point is: if we are trying to fix misbehaving drives/controllers 
(something that is more common than one might think), that's already 
done by ZFS on Solaris and FreeBSD, and will be done in btrfs for linux.


I understand not trusting drives/controllers; but not trusting a 
filesystem...



What am I missing? (I'm far from being an expert... I just don't 
understand...)






--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums

2011-12-21 Thread Tom Lane
Heikki Linnakangas heikki.linnakan...@enterprisedb.com writes:
 4 bytes out of a 8k block is just under 0.05%. I don't think anyone is 
 going to notice the extra disk space consumed by this. There's all those 
 other issues like the hint bits that make this a non-starter, but disk 
 space overhead is not one of them.

The bigger problem is that adding a CRC necessarily changes the page
format and therefore breaks pg_upgrade.  As Greg and Simon already
pointed out upthread, there's essentially zero chance of this getting
applied before we have a solution that allows pg_upgrade to cope with
page format changes.  A CRC feature is not compelling enough to justify
a non-upgradable release cycle.

regards, tom lane

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Cursor behavior

2011-12-21 Thread Robert Haas
On Thu, Dec 15, 2011 at 4:15 PM, amit sehas cu...@yahoo.com wrote:
 I had a question about the cursor internals implementation. When you Fetch 
 next 'n' results without moving the cursors, is this kind of functionality 
 implemented by firstly executing the whole query and then moving the cursor 
 over the results, or are the movements done on active database lookups, 
 moving forward and backward...

I think it depends on the query.  For example, I believe that a query
involving writeable CTEs will be run to completion before returning
any results, but I believe that a seqscan will not.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Cursor behavior

2011-12-21 Thread Robert Haas
On Wed, Dec 21, 2011 at 11:44 AM, amit sehas cu...@yahoo.com wrote:
 It seems that the task of fetching the next n results without moving the cursor
 is too complicated to implement for any query that has
 even a little bit of complication in it...

I think that's probably true.  It would also be expensive if you did
implement it; who wants to go re-execute a join to back up the scan?

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] CLOG contention

2011-12-21 Thread Tom Lane
Robert Haas robertmh...@gmail.com writes:
 I think there probably are some scalability limits to the current
 implementation, but also I think we could probably increase the
 current value modestly with something less than a total rewrite.
 Linearly scanning the slot array won't scale indefinitely, but I think
 it will scale to more than 8 elements.  The performance results I
 posted previously make it clear that 8 -> 32 is a net win at least on
 that system.

Agreed, the question is whether 32 is enough to fix the problem for
anything except this one benchmark.

 One fairly low-impact option might be to make the cache
 less than fully associative - e.g. given N buffers, a page with pageno
 % 4 == X is only allowed to be in a slot numbered between (N/4)*X and
 (N/4)*(X+1)-1.  That likely would be counterproductive at N = 8 but
 might be OK at larger values.

I'm inclined to think that that specific arrangement wouldn't be good.
The normal access pattern for CLOG is, I believe, an exponentially
decaying probability-of-access for each page as you go further back from
current.  We have a hack to pin the current (latest) page into SLRU all
the time, but you want the design to be such that the next-to-latest
page is most likely to still be around, then the second-latest, etc.

If I'm reading your equation correctly then the most recent pages would
compete against each other, not against much older pages, which is
exactly the wrong thing.  Perhaps what you actually meant to say was
that all pages with the same number mod 4 are in one bucket, which would
be better, but still not really ideal: for instance the next-to-latest
page could end up getting removed while say the third-latest page is
still there because it's in a different associative bucket that's under
less pressure.

But possibly we could fix that with some other variant of the idea.
I certainly agree that strict LRU isn't an essential property here,
so long as we have a design that is matched to the expected access
pattern statistics.

 We could also switch to using a hash
 table but that seems awfully heavy-weight.

Yeah.  If we're not going to go to hundreds of CLOG buffers, which
I think probably wouldn't be useful, then hashing is unlikely to be the
best answer.

 The real question is how to decide how many buffers to create.  You
 suggested a formula based on shared_buffers, but what would that
 formula be?  I mean, a typical large system is going to have 1,048,576
 shared buffers, and it probably needs less than 0.1% of that amount of
 CLOG buffers.

Well, something like 0.1% with minimum of 8 and max of 32 might be
reasonable.  What I'm mainly fuzzy about is the upper limit.

regards, tom lane

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] CLOG contention

2011-12-21 Thread Simon Riggs
On Wed, Dec 21, 2011 at 3:28 PM, Robert Haas robertmh...@gmail.com wrote:
 On Wed, Dec 21, 2011 at 5:17 AM, Simon Riggs si...@2ndquadrant.com wrote:
 With the increased performance we have now, I don't think increasing
 that alone will be that useful since it doesn't solve all of the
 problems and (I am told) likely reduces lookup speed.

 I have benchmarks showing that it works, for whatever that's worth.

 The full list of clog problems I'm aware of is: raw lookup speed,
 multi-user contention, writes at checkpoint and new xid allocation.

 What is the best workload to show a bottleneck on raw lookup speed?

A microbenchmark.

 I wouldn't expect writes at checkpoint to be a big problem because
 it's so little data.

 What's the problem with new XID allocation?

Earlier experience shows that those are areas of concern. You aren't
measuring response time in your tests, so you won't notice them as
problems. But they do affect throughput much more than intuition says
they would.


 Would it be better just to have multiple SLRUs dedicated to the clog?
 Simply partition things so we have 2^N sets of everything, and we look
 up the xid in partition (xid % (2^N)).  That would overcome all of the
 problems, not just lookup, in exactly the same way that we partitioned
 the buffer and lock manager. We would use a graduated offset on the
 page to avoid zeroing pages at the same time. Clog size wouldn't
 increase, we'd have the same number of bits, just spread across 2^N
 files. We'd have more pages too, but that's not a bad thing since it
 spreads out the contention.

 It seems that would increase memory requirements (clog1 through clog4
 with 2 pages each doesn't sound workable).  It would also break
 on-disk compatibility for pg_upgrade.  I'm still holding out hope that
 we can find a simpler solution...

Not sure what you mean by increase memory requirements. How would
increasing NUM_CLOG_BUFFERS = 64 differ from having NUM_CLOG_BUFFERS =
8 and NUM_CLOG_PARTITIONS = 8?

I think you appreciate that having 8 lwlocks rather than 1 might help
scalability.

I'm sure pg_upgrade can be tweaked easily enough and it would still
work quickly.

-- 
 Simon Riggs   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] CLOG contention

2011-12-21 Thread Robert Haas
On Wed, Dec 21, 2011 at 11:48 AM, Tom Lane t...@sss.pgh.pa.us wrote:
 Agreed, the question is whether 32 is enough to fix the problem for
 anything except this one benchmark.

Right.  My thought on that topic is that it depends on what you mean
by fix.  It's clearly NOT possible to keep enough CLOG buffers
around to cover the entire range of XID space that might get probed,
at least not without some massive rethinking of the infrastructure.
It seems that the amount of space that might need to be covered there
is at least on the order of vacuum_freeze_table_age, which is to say
150 million by default.  At 32K txns/page, that would require almost
5K pages, which is a lot more than 8.

On the other hand, if we just want to avoid having more requests
simultaneously in flight than we have buffers, so that backends don't
need to wait for an available buffer before beginning their I/O, then
something on the order of the number of CPUs in the machine is likely
sufficient.  I'll do a little more testing and see if I can figure out
where the tipping point is on this 32-core box.

 One fairly low-impact option might be to make the cache
 less than fully associative - e.g. given N buffers, a page with pageno
 % 4 == X is only allowed to be in a slot numbered between (N/4)*X and
 (N/4)*(X+1)-1.  That likely would be counterproductive at N = 8 but
 might be OK at larger values.

 I'm inclined to think that that specific arrangement wouldn't be good.
 The normal access pattern for CLOG is, I believe, an exponentially
 decaying probability-of-access for each page as you go further back from
 current.  We have a hack to pin the current (latest) page into SLRU all
 the time, but you want the design to be such that the next-to-latest
 page is most likely to still be around, then the second-latest, etc.

 If I'm reading your equation correctly then the most recent pages would
 compete against each other, not against much older pages, which is
 exactly the wrong thing.  Perhaps what you actually meant to say was
 that all pages with the same number mod 4 are in one bucket, which would
 be better,

That's what I meant.  I think the formula works out to that, but in
any case it's what I meant.  :-)

  but still not really ideal: for instance the next-to-latest
 page could end up getting removed while say the third-latest page is
 still there because it's in a different associative bucket that's under
 less pressure.

Well, sure.  But who is to say that's bad?  I think you can find a way
to throw stones at any given algorithm we might choose to implement.
For example, if you contrive things so that you repeatedly access the
same old CLOG pages cyclically: 1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8,...

...then our existing LRU algorithm will be anti-optimal, because we'll
keep the latest page plus the most recently accessed 7 old pages in
memory, and every lookup will fault out the page that the next lookup
is about to need.  If you're not that excited about that happening in
real life, neither am I.  But neither am I that excited about your
scenario: if the next-to-last page gets kicked out, there are a whole
bunch of pages -- maybe 8, if you imagine 32 buffers split 4 ways --
that have been accessed more recently than that next-to-last page.  So
it wouldn't be resident in an 8-buffer pool either.  Maybe the last
page was mostly transactions updating some infrequently-accessed
table, and we don't really need that page right now.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] CLOG contention

2011-12-21 Thread Tom Lane
Robert Haas robertmh...@gmail.com writes:
 On Wed, Dec 21, 2011 at 11:48 AM, Tom Lane t...@sss.pgh.pa.us wrote:
 I'm inclined to think that that specific arrangement wouldn't be good.
 The normal access pattern for CLOG is, I believe, an exponentially
 decaying probability-of-access for each page as you go further back from
 current. ... for instance the next-to-latest
 page could end up getting removed while say the third-latest page is
 still there because it's in a different associative bucket that's under
 less pressure.

 Well, sure.  But who is to say that's bad?  I think you can find a way
 to throw stones at any given algorithm we might choose to implement.

The point I'm trying to make is that buffer management schemes like
that one are built on the assumption that the probability of access is
roughly uniform for all pages.  We know (or at least have strong reason
to presume) that CLOG pages have very non-uniform probability of access.
The straight LRU scheme is good because it deals well with non-uniform
access patterns.  Dividing the buffers into independent buckets in a way
that doesn't account for the expected access probabilities is going to
degrade things.  (The approach Simon suggests nearby seems isomorphic to
yours and so suffers from this same objection, btw.)

 For example, if you contrive things so that you repeatedly access the
 same old CLOG pages cyclically: 1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8,...

Sure, and the reason that that's contrived is that it flies in the face
of reasonable assumptions about CLOG access probabilities.  Any scheme
will lose some of the time, but you don't want to pick a scheme that is
more likely to lose for more probable access patterns.

It strikes me that one simple thing we could do is extend the current
heuristic that says pin the latest page.  That is, pin the last K
pages into SLRU, and apply LRU or some other method across the rest.
If K is large enough, that should get us down to where the differential
in access probability among the older pages is small enough to neglect,
and then we could apply associative bucketing or other methods to the
rest without fear of getting burnt by the common usage pattern.  I don't
know what K would need to be, though.  Maybe it's worth instrumenting
a benchmark run or two so we can get some facts rather than guesses
about the access frequencies?
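
A rough standalone sketch of that heuristic, with made-up names and a
simplified recency counter (slru.c's real bookkeeping differs): keep the
K newest pages off-limits to eviction and run plain LRU over the rest.

#include <stdio.h>

#define NUM_BUFFERS 32
#define PIN_LAST_K  4           /* K newest clog pages are never evicted */

/*
 * Pick a victim slot: skip any slot holding one of the K most recent
 * pages, then take the least recently used of what remains.  Smaller
 * last_used values mean "used longer ago"; -1 means nothing evictable.
 */
static int
choose_victim_slot(const int page_of_slot[NUM_BUFFERS],
                   const int last_used[NUM_BUFFERS],
                   int latest_page)
{
    int     best = -1;

    for (int slot = 0; slot < NUM_BUFFERS; slot++)
    {
        if (page_of_slot[slot] > latest_page - PIN_LAST_K)
            continue;           /* pinned: one of the K newest pages */
        if (best < 0 || last_used[slot] < last_used[best])
            best = slot;
    }
    return best;
}

int
main(void)
{
    int     pages[NUM_BUFFERS];
    int     used[NUM_BUFFERS];

    for (int i = 0; i < NUM_BUFFERS; i++)
    {
        pages[i] = 100 + i;     /* slot i holds page 100+i; 131 is latest */
        used[i] = i;
    }
    printf("victim slot: %d\n", choose_victim_slot(pages, used, 131));
    return 0;
}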

regards, tom lane

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


[HACKERS] bghinter process

2011-12-21 Thread Kevin Grittner
Would it make sense, as suggested by Álvaro in the CLOG contention
thread, to have a background process to set hint bits on tuples in
dirty pages?  Processing could be loosely based around the
background writer techniques in terms of sweeping through the cache,
but it would only look at dirty pages (destined to be written
anyway) and I think we might want to further limit it to looking at
tuples with an xmin or xmax value which precedes the global xmin
value and doesn't yet have a hint.  It wouldn't do any writing; it
would just check visibility and set hint bits.
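 
To make that concrete, here is a toy self-contained model (not
PostgreSQL internals; the types are simplified stand-ins) of the sweep
just described: visit only dirty pages, and for unhinted tuples whose
xmin precedes the global xmin, do the visibility check and set the hint
without dirtying anything that wasn't already going to be written.
 
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
 
typedef uint32_t TransactionId;
 
typedef struct Tuple
{
    TransactionId xmin;
    bool          xmin_committed_hint;
} Tuple;
 
typedef struct Page
{
    bool    dirty;
    int     ntuples;
    Tuple   tuples[4];
} Page;
 
/* Stand-in for the clog lookup; assume the transaction committed. */
static bool
xid_committed(TransactionId xid)
{
    (void) xid;
    return true;
}
 
/* One step of the hypothetical background hinter. */
static void
hint_dirty_page(Page *page, TransactionId global_xmin)
{
    if (!page->dirty)
        return;                 /* skip pages not destined to be written */
 
    for (int i = 0; i < page->ntuples; i++)
    {
        Tuple      *tup = &page->tuples[i];
 
        if (!tup->xmin_committed_hint && tup->xmin < global_xmin &&
            xid_committed(tup->xmin))
            tup->xmin_committed_hint = true;    /* page was dirty anyway */
    }
}
 
int
main(void)
{
    Page    page = {true, 2, {{100, false}, {900, false}}};
 
    hint_dirty_page(&page, 500);    /* only the xmin = 100 tuple gets hinted */
    printf("tuple 0 hinted: %d, tuple 1 hinted: %d\n",
           page.tuples[0].xmin_committed_hint,
           page.tuples[1].xmin_committed_hint);
    return 0;
}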
 
-Kevin

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] CLOG contention

2011-12-21 Thread Robert Haas
On Wed, Dec 21, 2011 at 1:09 PM, Tom Lane t...@sss.pgh.pa.us wrote:
 It strikes me that one simple thing we could do is extend the current
 heuristic that says pin the latest page.  That is, pin the last K
 pages into SLRU, and apply LRU or some other method across the rest.
 If K is large enough, that should get us down to where the differential
 in access probability among the older pages is small enough to neglect,
 and then we could apply associative bucketing or other methods to the
 rest without fear of getting burnt by the common usage pattern.  I don't
 know what K would need to be, though.  Maybe it's worth instrumenting
 a benchmark run or two so we can get some facts rather than guesses
 about the access frequencies?

I guess the point is that it seems to me to depend rather heavily on
what benchmark you run.  For something like pgbench, we initialize the
cluster with one or a few big transactions, so the page containing
those XIDs figures to stay hot for a very long time.  Then after that
we choose rows to update randomly, which will produce the sort of
newer-pages-are-hotter-than-older-pages effect that you're talking
about.  But the slope of the curve depends heavily on the scale
factor.  If we have scale factor 1 (= 100,000 rows) then chances are
that when we randomly pick a row to update, we'll hit one that's been
touched within the last few hundred thousand updates - i.e. the last
couple of CLOG pages.  But if we have scale factor 100 (= 10,000,000
rows) we might easily hit a row that hasn't been updated for many
millions of transactions, so there's going to be a much longer tail
there.  And some other test could yield very different results - e.g.
something that uses lots of subtransactions might well have a much
longer tail, while something that does more than one update per
transaction would presumably have a shorter one.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] bghinter process

2011-12-21 Thread Robert Haas
On Wed, Dec 21, 2011 at 1:14 PM, Kevin Grittner
kevin.gritt...@wicourts.gov wrote:
 Would it make sense, as suggested by Álvaro in the CLOG contention
 thread, to have a background process to set hint bits on tuples in
 dirty pages?  Processing could be loosely based around the
 background writer techniques in terms of sweeping through the cache,
 but it would only look at dirty pages (destined to be written
 anyway) and I think we might want to further limit it to looking at
 tuples with an xmin or xmax value which precede the global xmin
 value and doesn't yet have a hint.  It wouldn't do any writing; it
 would just check visibility and set hint bits.

Maybe.  But I think we'd need to see some test results showing that it
helps.  I mean, the nice thing about our current system is that we
don't set hint bits on tuples unless we otherwise have some need to
look at them.  Something like this COULD end up chewing up CPU time
and memory bandwidth without actually improving performance.  On a
high-velocity system we could dirty the same buffers multiple times in
the course of a second, so a background process that scans through the
buffer pool say, once per minute would effectively be standing still.

Now, that's not to say there aren't case where it would help.  I just
don't know exactly what they are.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] sorting table columns

2011-12-21 Thread Simon Riggs
On Wed, Dec 21, 2011 at 1:42 PM, Alvaro Herrera
alvhe...@commandprompt.com wrote:

 This one I'm not sure about at all:

 * very large number of columns for statistical data sets where we
 automatically vertically partition the heap when faced with large
 numbers of column definitions

We currently have pg_attribute.attnum as an int2, so we can store up
to 32768 columns without changing that size, as long as we have some
place to put the data.

Was there something you're working on likely to prevent 240 cols?
Just worth documenting what you see at this stage.

-- 
 Simon Riggs   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] bghinter process

2011-12-21 Thread Alvaro Herrera

Excerpts from Robert Haas's message of Wed Dec 21 15:47:13 -0300 2011:

 Maybe.  But I think we'd need to see some test results showing that it
 helps.  I mean, the nice thing about our current system is that we
 don't set hint bits on tuples unless we otherwise have some need to
 look at them.

The bad thing about that is you have client-connected processes doing
CLOG lookups to figure out the hint values, which adds latency to them.
I guess we're not concerned very much about the latency of this
bghinter -- the only thing it'd do is add some pressure to the clog
LRU; so maybe one thing to keep in mind is that bghinter should add its
clog pages near the tail of the LRU queue, not the head, so that it
doesn't evict pages that are being used by client-connected backends.

This bghinter could keep a cache of committed/aborted transactions of
its own, further reducing the need to look up clog pages.
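Very roughly, something like this (the cache size and all names are invented,
and the clog probe is stubbed out, so treat it as a sketch only):

#include <stdbool.h>
#include <stdint.h>

typedef uint32_t TransactionId;

#define XID_CACHE_SIZE 1024         /* arbitrary, for illustration */

typedef struct
{
    TransactionId xid;
    bool        committed;
    bool        valid;
} XidCacheEntry;

static XidCacheEntry xid_cache[XID_CACHE_SIZE];

/* Stand-in for the real clog probe. */
static bool
clog_says_committed(TransactionId xid)
{
    (void) xid;
    return true;                    /* stub, for illustration only */
}

static bool
xid_committed_cached(TransactionId xid)
{
    XidCacheEntry *e = &xid_cache[xid % XID_CACHE_SIZE];

    if (!e->valid || e->xid != xid)
    {
        /* cache miss: do the clog lookup once, then remember the answer */
        e->xid = xid;
        e->committed = clog_says_committed(xid);
        e->valid = true;
    }
    return e->committed;
}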

 Something like this COULD end up chewing up CPU time
 and memory bandwidth without actually improving performance.  On a
 high-velocity system we could dirty the same buffers multiple times in
 the course of a second, so a background process that scans through the
 buffer pool say, once per minute would effectively be standing still.

But, well, tuples that are successfully hinted need no more hint bits.

-- 
Álvaro Herrera alvhe...@commandprompt.com
The PostgreSQL Company - Command Prompt, Inc.
PostgreSQL Replication, Consulting, Custom Development, 24x7 support

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] sorting table columns

2011-12-21 Thread Alvaro Herrera

Excerpts from Simon Riggs's message of Wed Dec 21 15:53:20 -0300 2011:
 On Wed, Dec 21, 2011 at 1:42 PM, Alvaro Herrera
 alvhe...@commandprompt.com wrote:
 
  This one I'm not sure about at all:
 
  * very large number of columns for statistical data sets where we
  automatically vertically partition the heap when faced with large
  numbers of column definitions
 
 We currently have pg_attribute.attnum as an int2, so we can store up
 to 32768 columns without changing that size, as long as we have some
 place to put the data.

Hm, right.

 Was there something you're working on likely to prevent 240 cols?

No, not at all.

 Just worth documenting what you see at this stage.

I'll keep my eyes open :-)

-- 
Álvaro Herrera alvhe...@commandprompt.com
The PostgreSQL Company - Command Prompt, Inc.
PostgreSQL Replication, Consulting, Custom Development, 24x7 support

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] CLOG contention

2011-12-21 Thread Simon Riggs
On Wed, Dec 21, 2011 at 3:24 PM, Robert Haas robertmh...@gmail.com wrote:

 I think there probably are some scalability limits to the current
 implementation, but also I think we could probably increase the
 current value modestly with something less than a total rewrite.
 Linearly scanning the slot array won't scale indefinitely, but I think
 it will scale to more than 8 elements.  The performance results I
posted previously make it clear that 8 -> 32 is a net win at least on
 that system.

Agreed to that, but I don't think its nearly enough.

 One fairly low-impact option might be to make the cache
 less than fully associative - e.g. given N buffers, a page with pageno
 % 4 == X is only allowed to be in a slot numbered between (N/4)*X and
 (N/4)*(X+1)-1.  That likely would be counterproductive at N = 8 but
 might be OK at larger values.

Which is pretty much the same as saying, yes, lets partition the clog
as I suggested, but by a different route.

 We could also switch to using a hash
 table but that seems awfully heavy-weight.

Which is a re-write of SLRU from the ground up and inappropriate for most SLRU
usage. We'd get partitioning for free as long as we re-write.

-- 
 Simon Riggs   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] CLOG contention

2011-12-21 Thread Robert Haas
On Wed, Dec 21, 2011 at 2:05 PM, Simon Riggs si...@2ndquadrant.com wrote:
 On Wed, Dec 21, 2011 at 3:24 PM, Robert Haas robertmh...@gmail.com wrote:
 I think there probably are some scalability limits to the current
 implementation, but also I think we could probably increase the
 current value modestly with something less than a total rewrite.
 Linearly scanning the slot array won't scale indefinitely, but I think
 it will scale to more than 8 elements.  The performance results I
 posted previously make it clear that 8 -> 32 is a net win at least on
 that system.

 Agreed to that, but I don't think its nearly enough.

 One fairly low-impact option might be to make the cache
 less than fully associative - e.g. given N buffers, a page with pageno
 % 4 == X is only allowed to be in a slot numbered between (N/4)*X and
 (N/4)*(X+1)-1.  That likely would be counterproductive at N = 8 but
 might be OK at larger values.

 Which is pretty much the same as saying, yes, lets partition the clog
 as I suggested, but by a different route.

 We could also switch to using a hash
 table but that seems awfully heavy-weight.

 Which is a re-write of SLRU from the ground up and inappropriate for most SLRU
 usage. We'd get partitioning for free as long as we re-write.

I'm not sure what your point is here.  I feel like this is on the edge
of turning into an argument, and if we're going to have an argument
I'd like to know what we're arguing about.  I am not arguing that
under no circumstances should we partition anything related to CLOG,
nor am I trying to deny you credit for your ideas.  I'm merely saying
that the specific plan of having multiple SLRUs for CLOG doesn't
appeal to me -- mostly because I think it will make life difficult for
pg_upgrade without any compensating advantage.  If we're going to go
that route, I'd rather build something into the SLRU machinery
generally that allows for the cache to be less than fully-associative,
with all of the savings in terms of lock contention that this entails.
 Such a system could be used by any SLRU, not just CLOG, if it proved
to be helpful; and it would avoid any on-disk changes, with, as far as
I can see, basically no downside.

That having been said, Tom isn't convinced that any form of
partitioning is the right way to go, and since Tom often has good
ideas, I'd like to explore his notions of how we might fix this
problem other than via some form of partitioning before we focus in on
partitioning.  Partitioning may ultimately be the right way to go, but
let's keep an open mind: this thread is only 14 hours old.  The only
things I'm completely convinced of at this point are (1) we need more
CLOG buffers (but I don't know exactly how many) and (2) the current
code isn't designed to manage large numbers of buffers (but I don't
know exactly where it starts to fall over).

If I'm completely misunderstanding the point of your email, please set
me straight (gently).

Thanks,

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums

2011-12-21 Thread Greg Smith

On 12/21/2011 10:49 AM, Stephen Frost wrote:

* Leonardo Francalanci (m_li...@yahoo.it) wrote:
   

I think what I meant was: isn't this going to be useless in a couple
of years (if, say, btrfs will be available)? Or it actually gives
something that FS will never be able to give?
 

Yes, it will help you find/address bugs in the filesystem.  These things
are not unheard of...
   


There was a spike in data recovery business here after people started 
migrating to ext4.  New filesystems are no fun to roll out; some bugs 
will only get shaken out when brave early adopters deploy them.


And there's even more radical changes in btrfs, since it wasn't starting 
with a fairly robust filesystem as a base.  And putting my tin foil hat 
on, I don't feel real happy about assuming *the* solution for this issue 
in PostgreSQL is the possibility of a filesystem coming one day when 
that work is being steered by engineers who work at Oracle.


--
Greg Smith   2ndQuadrant US    g...@2ndquadrant.com   Baltimore, MD
PostgreSQL Training, Services, and 24x7 Support  www.2ndQuadrant.us


--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] CLOG contention

2011-12-21 Thread Robert Haas
On Wed, Dec 21, 2011 at 12:48 PM, Robert Haas robertmh...@gmail.com wrote:
 On the other hand, if we just want to avoid having more requests
 simultaneously in flight than we have buffers, so that backends don't
 need to wait for an available buffer before beginning their I/O, then
 something on the order of the number of CPUs in the machine is likely
 sufficient.  I'll do a little more testing and see if I can figure out
 where the tipping point is on this 32-core box.

I recompiled with NUM_CLOG_BUFFERS = 8, 16, 24, 32, 40, 48 and ran
5-minute tests, using unlogged tables to avoid getting killed by
WALInsertLock contentions.  With 32-clients on this 32-core box, the
tipping point is somewhere in the neighborhood of 32 buffers.  40
buffers might still be winning over 32, or maybe not, but 48 is
definitely losing.  Below 32, more is better, all the way up.  Here
are the full results:

resultswu.clog16.32.100.300:tps = 19549.454462 (including connections
establishing)
resultswu.clog16.32.100.300:tps = 19883.583245 (including connections
establishing)
resultswu.clog16.32.100.300:tps = 19984.857186 (including connections
establishing)
resultswu.clog24.32.100.300:tps = 20124.147651 (including connections
establishing)
resultswu.clog24.32.100.300:tps = 20108.504407 (including connections
establishing)
resultswu.clog24.32.100.300:tps = 20303.964120 (including connections
establishing)
resultswu.clog32.32.100.300:tps = 20573.873097 (including connections
establishing)
resultswu.clog32.32.100.300:tps = 20444.289259 (including connections
establishing)
resultswu.clog32.32.100.300:tps = 20234.209965 (including connections
establishing)
resultswu.clog40.32.100.300:tps = 21762.222195 (including connections
establishing)
resultswu.clog40.32.100.300:tps = 20621.749677 (including connections
establishing)
resultswu.clog40.32.100.300:tps = 20290.990673 (including connections
establishing)
resultswu.clog48.32.100.300:tps = 19253.424997 (including connections
establishing)
resultswu.clog48.32.100.300:tps = 19542.095191 (including connections
establishing)
resultswu.clog48.32.100.300:tps = 19284.962036 (including connections
establishing)
resultswu.master.32.100.300:tps = 18694.886622 (including connections
establishing)
resultswu.master.32.100.300:tps = 18417.647703 (including connections
establishing)
resultswu.master.32.100.300:tps = 18331.718955 (including connections
establishing)


Parameters in use: shared_buffers = 8GB, maintenance_work_mem = 1GB,
synchronous_commit = off, checkpoint_segments = 300,
checkpoint_timeout = 15min, checkpoint_completion_target = 0.9,
wal_writer_delay = 20ms

It isn't clear to me whether we can extrapolate anything more general
from this.  It'd be awfully interesting to repeat this experiment on,
say, an 8-core server, but I don't have one of those I can use at the
moment.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] RangeVarGetRelid()

2011-12-21 Thread Robert Haas
On Tue, Dec 20, 2011 at 8:14 PM, Noah Misch n...@leadboat.com wrote:
 I also notice that cluster() - which doesn't have a callback - has
 exactly the same needs as ReindexRelation() - which does.  So that
 case can certainly share code; though I'm not quite sure what to call
 the shared callback, or which file to put it in.
 RangeVarCallbackForStorageRewrite?

 I'd put it in tablecmds.c and name it RangeVarCallbackOwnsTable.

OK.
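For concreteness, I imagine the shared callback would look roughly like the
sketch below -- not committed code, and details such as exactly which
relkinds to accept are still guesses:

#include "postgres.h"
#include "catalog/pg_class.h"
#include "miscadmin.h"
#include "nodes/parsenodes.h"
#include "utils/acl.h"
#include "utils/lsyscache.h"

void
RangeVarCallbackOwnsTable(const RangeVar *relation,
                          Oid relId, Oid oldRelId, void *arg)
{
    char        relkind;

    /* Nothing to check if the relation wasn't found at all. */
    if (!OidIsValid(relId))
        return;

    /* It may also have been dropped since the name lookup. */
    relkind = get_rel_relkind(relId);
    if (relkind == '\0')
        return;

    /* CLUSTER and REINDEX only make sense on tables (and toast tables). */
    if (relkind != RELKIND_RELATION && relkind != RELKIND_TOASTVALUE)
        ereport(ERROR,
                (errcode(ERRCODE_WRONG_OBJECT_TYPE),
                 errmsg("\"%s\" is not a table", relation->relname)));

    /* Caller must own the relation. */
    if (!pg_class_ownercheck(relId, GetUserId()))
        aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS, relation->relname);
}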

 RangeVarCallbackForAlterRelation() does not preserve ALTER TABLE's refusal to
 operate on foreign tables.

I should probably fix that, but I'm wondering if I ought to fix it by
disallowing the use of ALTER TABLE on FOREIGN TABLEs for the other
commands that share the callback as well.  Allowing ALTER TABLE to
apply to any relation type is mostly a legacy thing, I think, and any
code that's new enough to know about foreign tables isn't old enough
to know about the time when you had to use ALTER TABLE to rename
views.

 RangeVarCallbackForAlterRelation() does not preserve the check for unexpected
 object types.

I don't feel a strong need to retain that.

 utility.c doesn't take locks for any other command; parse analysis usually
 does that.  To preserve that modularity, you could add a bool toplevel
 argument to transformAlterTableStmt().  Pass true here, false in
 ATPostAlterTypeParse().  If true, use AlterTableLookupRelation() to get full
 security checks.  Otherwise, just call relation_openrv() as now.  Would that
 be an improvement?

Not sure.  I feel that it's unwise to pass relation names all over the
backend and assume that nothing will change meanwhile; no locking we
do will prevent that, at least in the case of search path
interposition.  Ultimately I think this ought to be restructured
somehow so that we look up each name ONCE and ever-after refer only to
the resulting OID (except for error message text).  But I'm not sure
how to do that, and thought it might make sense to commit this much
independently of such a refactoring.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] bghinter process

2011-12-21 Thread Aidan Van Dyk
On Wed, Dec 21, 2011 at 1:59 PM, Alvaro Herrera
alvhe...@commandprompt.com wrote:

 But, well, tuples that are successfully hinted need no more hint bits.

Not only do they need no more hinting, they also allow the next
client-serving process that hits them to avoid the clog lookup needed to
determine the hint.

a.

-- 
Aidan Van Dyk                                             Create like a god,
ai...@highrise.ca                                       command like a king,
http://www.highrise.ca/                                   work like a slave.

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


[HACKERS] Page Checksums + Double Writes

2011-12-21 Thread David Fetter
Folks,

One of the things VMware is working on is double writes, per previous
discussions of how, for example, InnoDB does things.   I'd initially
thought that introducing just one of the features in $Subject at a
time would help, but I'm starting to see a mutual dependency.

The issue is that double writes needs a checksum to work by itself,
and page checksums more broadly work better when there are double
writes, obviating the need to have full_page_writes on.

If submitting these things together seems like a better idea than
having them arrive separately, I'll work with my team here to make
that happen soonest.

There's a separate issue we'd like to get clear on, which is whether
it would be OK to make a new PG_PAGE_LAYOUT_VERSION.

If so, there's less to do, but pg_upgrade as it currently stands is
broken.

If not, we'll have to do some extra work on the patch as described
below.  Thanks to Kevin Grittner for coming up with this :)

- Use a header bit to say whether we've got a checksum on the page.
  We're using 3/16 of the available bits as described in
  src/include/storage/bufpage.h.

- When that bit is set, place the checksum somewhere convenient on the
  page.  One way to do this would be to have an optional field at the
  end of the special space based on the new bit.  Rows from pg_upgrade
  would have the bit clear, and would have the shorter special
  structure without the checksum.
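As a sketch of the second point, reading the optional field could look like
the code below.  The flag name and bit value are placeholders invented for
illustration, not taken from the work described above:

#include "postgres.h"
#include "storage/bufpage.h"

#define PD_HAS_CHECKSUM     0x0008      /* hypothetical pd_flags bit */

static bool
page_has_checksum(Page page)
{
    return (((PageHeader) page)->pd_flags & PD_HAS_CHECKSUM) != 0;
}

static uint16
page_stored_checksum(Page page)
{
    uint16      csum;

    /* the optional field sits in the last two bytes of the special space */
    memcpy(&csum, (char *) page + BLCKSZ - sizeof(uint16), sizeof(uint16));
    return csum;
}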

Cheers,
David.
-- 
David Fetter da...@fetter.org http://fetter.org/
Phone: +1 415 235 3778  AIM: dfetter666  Yahoo!: dfetter
Skype: davidfetter  XMPP: david.fet...@gmail.com
iCal: webcal://www.tripit.com/feed/ical/people/david74/tripit.ics

Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums + Double Writes

2011-12-21 Thread Alvaro Herrera

Excerpts from David Fetter's message of Wed Dec 21 18:59:13 -0300 2011:

 If not, we'll have to do some extra work on the patch as described
 below.  Thanks to Kevin Grittner for coming up with this :)
 
 - Use a header bit to say whether we've got a checksum on the page.
   We're using 3/16 of the available bits as described in
   src/include/storage/bufpage.h.
 
 - When that bit is set, place the checksum somewhere convenient on the
   page.  One way to do this would be to have an optional field at the
   end of the special space based on the new bit.  Rows from pg_upgrade
   would have the bit clear, and would have the shorter special
   structure without the checksum.

If you get away with a new page format, let's make sure and coordinate
so that we can add more info into the header.  One thing I wanted was to
have an ID struct on each file, so that you know what
DB/relation/segment the file corresponds to.  So the first page's
special space would be a bit larger than the others.

-- 
Álvaro Herrera alvhe...@commandprompt.com
The PostgreSQL Company - Command Prompt, Inc.
PostgreSQL Replication, Consulting, Custom Development, 24x7 support

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums + Double Writes

2011-12-21 Thread Kevin Grittner
Alvaro Herrera alvhe...@commandprompt.com wrote:
 
 If you get away with a new page format, let's make sure and
 coordinate so that we can add more info into the header.  One
 thing I wanted was to have an ID struct on each file, so that you
 know what DB/relation/segment the file corresponds to.  So the
 first page's special space would be a bit larger than the others.
 
Couldn't that also be done by burning a bit in the page header
flags, without a page layout version bump?  If that were done, you
wouldn't have the additional information on tables converted by
pg_upgrade, but you would get them on new tables, including those
created by pg_dump/psql conversions.  Adding them could even be made
conditional, although I don't know whether that's a good idea
 
-Kevin

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums

2011-12-21 Thread Martijn van Oosterhout
On Wed, Dec 21, 2011 at 09:32:28AM +0100, Leonardo Francalanci wrote:
 I can't help in this discussion, but I have a question:
 how different would this feature be from filesystem-level CRC, such
 as the one available in ZFS and btrfs?

Hmm, filesystems are not magical. If they implement this then they will
have the same issues with torn pages as Postgres would.  Which I
imagine they solve by doing a transactional update: writing the new
page to a new location with its checksum, then updating a pointer.  They
can't even put the checksum on the same page, like we could.  How that
interacts with seqscans I have no idea.

Certainly I think we could look to them for implementation ideas, but I
don't imagine they've got something that can't be specialised for
better performance.

Have a nice day,
-- 
Martijn van Oosterhout   klep...@svana.org   http://svana.org/kleptog/
 He who writes carelessly confesses thereby at the very outset that he does
 not attach much importance to his own thoughts.
   -- Arthur Schopenhauer


signature.asc
Description: Digital signature


Re: [HACKERS] Page Checksums + Double Writes

2011-12-21 Thread Simon Riggs
On Wed, Dec 21, 2011 at 10:19 PM, Kevin Grittner
kevin.gritt...@wicourts.gov wrote:
 Alvaro Herrera alvhe...@commandprompt.com wrote:

 If you get away with a new page format, let's make sure and
 coordinate so that we can add more info into the header.  One
 thing I wanted was to have an ID struct on each file, so that you
 know what DB/relation/segment the file corresponds to.  So the
 first page's special space would be a bit larger than the others.

 Couldn't that also be done by burning a bit in the page header
 flags, without a page layout version bump?  If that were done, you
 wouldn't have the additional information on tables converted by
 pg_upgrade, but you would get them on new tables, including those
 created by pg_dump/psql conversions.  Adding them could even be made
 conditional, although I don't know whether that's a good idea

These are good thoughts because they overcome the major objection to
doing *anything* here for 9.2.

We don't need to use any flag bits at all. We add
PG_PAGE_LAYOUT_VERSION to the control file, so that CRC checking
becomes an initdb option. All new pages can be created with
PG_PAGE_LAYOUT_VERSION from the control file. All existing pages must
be either the layout version from this release (4) or the next version
(5). Page validity then becomes version dependent.

pg_upgrade still works.

Layout 5 is where we add CRCs, so its basically optional.

We can also have a utility that allows you to bump the page version
for all new pages, even after you've upgraded, so we may end up with a
mix of page layout versions in the same relation. That's more
questionable but I see no problem with it.

Do we need CRCs as a table level option? I hope not. That complicates
many things.

All of this allows us to have another more efficient page version (6)
in future without problems, so its good infrastructure.

I'm now personally game on to make something work here for 9.2.

-- 
 Simon Riggs   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums + Double Writes

2011-12-21 Thread Tom Lane
David Fetter da...@fetter.org writes:
 There's a separate issue we'd like to get clear on, which is whether
 it would be OK to make a new PG_PAGE_LAYOUT_VERSION.

If you're not going to provide pg_upgrade support, I think there is no
chance of getting a new page layout accepted.  The people who might want
CRC support are pretty much exactly the same people who would find lack
of pg_upgrade a showstopper.

Now, given the hint bit issues, I rather doubt that you can make this
work without a page format change anyway.  So maybe you ought to just
bite the bullet and start working on the pg_upgrade problem, rather than
imagining you will find an end-run around it.

 The issue is that double writes needs a checksum to work by itself,
 and page checksums more broadly work better when there are double
 writes, obviating the need to have full_page_writes on.

Um.  So how is that going to work if checksums are optional?

regards, tom lane

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums

2011-12-21 Thread Simon Riggs
On Wed, Dec 21, 2011 at 7:35 PM, Greg Smith g...@2ndquadrant.com wrote:

 And there's even more radical changes in btrfs, since it wasn't starting
 with a fairly robust filesystem as a base.  And putting my tin foil hat on,
 I don't feel real happy about assuming *the* solution for this issue in
 PostgreSQL is the possibility of a filesystem coming one day when that work
 is being steered by engineers who work at Oracle.

Agreed.

I do agree with Heikki that it really ought to be the OS problem, but
then we thought that about dtrace and we're still waiting for that or
similar to be usable on all platforms (+/- 4 years).

-- 
 Simon Riggs   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] RangeVarGetRelid()

2011-12-21 Thread Noah Misch
On Wed, Dec 21, 2011 at 03:16:39PM -0500, Robert Haas wrote:
 On Tue, Dec 20, 2011 at 8:14 PM, Noah Misch n...@leadboat.com wrote:
  RangeVarCallbackForAlterRelation() does not preserve ALTER TABLE's refusal
  to operate on foreign tables.
 
 I should probably fix that, but I'm wondering if I ought to fix it by
 disallowing the use of ALTER TABLE on FOREIGN TABLEs for the other
 commands that share the callback as well.  Allowing ALTER TABLE to
 apply to any relation type is mostly a legacy thing, I think, and any
 code that's new enough to know about foreign tables isn't old enough
 to know about the time when you had to use ALTER TABLE to rename
 views.

Maybe.  Now that we have a release with these semantics, I'd lean toward
preserving the wart and being more careful next time.  It's certainly a
borderline case, though.

  RangeVarCallbackForAlterRelation() does not preserve the check for
  unexpected object types.
 
 I don't feel a strong need to retain that.

Okay.

  utility.c doesn't take locks for any other command; parse analysis usually
  does that.  To preserve that modularity, you could add a bool toplevel
  argument to transformAlterTableStmt().  Pass true here, false in
  ATPostAlterTypeParse().  If true, use AlterTableLookupRelation() to get full
  security checks.  Otherwise, just call relation_openrv() as now.  Would that
  be an improvement?
 
 Not sure.  I feel that it's unwise to pass relation names all over the
 backend and assume that nothing will change meanwhile; no locking we
 do will prevent that, at least in the case of search path
 interposition.  Ultimately I think this ought to be restructured
 somehow so that we look up each name ONCE and ever-after refer only to
 the resulting OID (except for error message text).  But I'm not sure
 how to do that, and thought it might make sense to commit this much
 independently of such a refactoring.

I agree with all that, though my suggestion would not have increased the
number of by-name lookups.

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums + Double Writes

2011-12-21 Thread Tom Lane
Simon Riggs si...@2ndquadrant.com writes:
 We don't need to use any flag bits at all. We add
 PG_PAGE_LAYOUT_VERSION to the control file, so that CRC checking
 becomes an initdb option. All new pages can be created with
 PG_PAGE_LAYOUT_VERSION from the control file. All existing pages must
 be either the layout version from this release (4) or the next version
 (5). Page validity then becomes version dependent.

 We can also have a utility that allows you to bump the page version
 for all new pages, even after you've upgraded, so we may end up with a
 mix of page layout versions in the same relation. That's more
 questionable but I see no problem with it.

It seems like you've forgotten all of the previous discussion of how
we'd manage a page format version change.

Having two different page formats running around in the system at the
same time is far from free; in the worst case it means that every single
piece of code that touches pages has to know about and be prepared to
cope with both versions.  That's a rather daunting prospect, from a
coding perspective and even more from a testing perspective.  Maybe
the issues can be kept localized, but I've seen no analysis done of
what the impact would be or how we could minimize it.  I do know that
we considered the idea and mostly rejected it a year or two back.

A utility to bump the page version is equally a whole lot easier said
than done, given that the new version has more overhead space and thus
less payload space than the old.  What does it do when the old page is
too full to be converted?  Move some data somewhere else might be
workable for heap pages, but I'm less sanguine about rearranging indexes
like that.  At the very least it would imply that the utility has full
knowledge about every index type in the system.

 I'm now personally game on to make something work here for 9.2.

If we're going to freeze 9.2 in the spring, I think it's a bit late
for this sort of work to be just starting.  What you've just described
sounds to me like possibly a year's worth of work.

regards, tom lane

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums + Double Writes

2011-12-21 Thread Simon Riggs
On Wed, Dec 21, 2011 at 11:43 PM, Tom Lane t...@sss.pgh.pa.us wrote:

 It seems like you've forgotten all of the previous discussion of how
 we'd manage a page format version change.

Maybe I've had too much caffeine. It's certainly late here.

 Having two different page formats running around in the system at the
 same time is far from free; in the worst case it means that every single
 piece of code that touches pages has to know about and be prepared to
 cope with both versions.  That's a rather daunting prospect, from a
 coding perspective and even more from a testing perspective.  Maybe
 the issues can be kept localized, but I've seen no analysis done of
 what the impact would be or how we could minimize it.  I do know that
 we considered the idea and mostly rejected it a year or two back.

I'm looking at that now.

My feeling is it probably depends upon how different the formats are,
so given we are discussing a 4 byte addition to the header, it might
be doable.

I'm investing some time on the required analysis.

 A utility to bump the page version is equally a whole lot easier said
 than done, given that the new version has more overhead space and thus
 less payload space than the old.  What does it do when the old page is
 too full to be converted?  Move some data somewhere else might be
 workable for heap pages, but I'm less sanguine about rearranging indexes
 like that.  At the very least it would imply that the utility has full
 knowledge about every index type in the system.

I agree, rewriting every page is completely out and I never even considered it.

 I'm now personally game on to make something work here for 9.2.

 If we're going to freeze 9.2 in the spring, I think it's a bit late
 for this sort of work to be just starting.

I agree with that. If this goes adrift it will have to be killed for 9.2.

-- 
 Simon Riggs   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums + Double Writes

2011-12-21 Thread Rob Wultsch
On Wed, Dec 21, 2011 at 1:59 PM, David Fetter da...@fetter.org wrote:
 One of the things VMware is working on is double writes, per previous
 discussions of how, for example, InnoDB does things.

The world is moving to flash, and the lifetime of flash is measured in
writes. Potentially doubling the number of writes is potentially
halving the life of the flash.

Something to think about...

-- 
Rob Wultsch
wult...@gmail.com

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] CLOG contention

2011-12-21 Thread Robert Haas
On Wed, Dec 21, 2011 at 4:17 PM, Simon Riggs si...@2ndquadrant.com wrote:
 Partitioning will give us more buffers and more LWlocks, to spread the
 contention when we access the buffers. I use that word because it's
 what we call the technique already used in the buffer manager and lock
 manager. If you wish to call this less than fully-associative I
 really don't mind, as long as we're discussing the same overall
 concept, so we can then focus on an implementation of that concept,
 which no doubt has many ways of doing it.

 More buffers per lock does reduce the lock contention somewhat, but
 not by much. So for me, it seems essential that we have more LWlocks
 to solve the problem, which is where partitioning comes in.

 My perspective is that there is clog contention in many places, not
 just in the ones you identified.

Well, that's possible.  The locking in slru.c is pretty screwy and
could probably benefit from better locking granularity.  One point
worth noting is that the control lock for each SLRU protects all the
SLRU buffer mappings and the contents of all the buffers; in the main
buffer manager, those responsibilities are split across
BufFreelistLock, 16 buffer manager partition locks, one content lock
per buffer, and the buffer header spinlocks.  (The SLRU per-buffer
locks are the equivalent of the I/O-in-progress locks, not the
content locks.)  So splitting up CLOG into multiple SLRUs might not be
the only way of improving the lock granularity; the current situation
is almost comical.

But on the flip side, I feel like your discussion of the problems is a
bit hand-wavy.  I think we need some real test cases that we can look
at and measure, not just an informal description of what we think is
happening.  I'm sure, for example, that repeatedly reading different
CLOG pages costs something - but I'm not sure that it's enough to have
a material impact on performance.  And if it doesn't, then we'd be
better off leaving it alone and working on things that do.  And if it
does, then we need a way to assess how successful any given approach
is in addressing that problem, so we can decide which of various
proposed approaches is best.

 * We allocate a new clog page every 32k xids. At the rates you have
 now measured, we will do this every 1-2 seconds.

And a new pg_subtrans page quite a bit more frequently than that.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums + Double Writes

2011-12-21 Thread Robert Haas
On Wed, Dec 21, 2011 at 7:06 PM, Simon Riggs si...@2ndquadrant.com wrote:
 My feeling is it probably depends upon how different the formats are,
 so given we are discussing a 4 byte addition to the header, it might
 be doable.

I agree.  When thinking back on Zoltan's patches, it's worth
remembering that he had a number of pretty bad ideas mixed in with the
good stuff - such as taking a bunch of things that are written as
macros for speed, and converting them to function calls.  Also, he
didn't make any attempt to isolate the places that needed to know
about both page versions; everybody knew about everything, everywhere,
and so everything needed to branch in places where it had not needed
to do so before.  I don't think we should infer from the failure of
those patches that no one can do any better.

On the other hand, I also agree with Tom that the chances of getting
this done in time for 9.2 are virtually zero, assuming that (1) we
wish to ship 9.2 in 2012 and (2) we don't wish to be making
destabilizing changes beyond the end of the last CommitFest.  There is
a lot of work here, and I would be astonished if we could wrap it all
up in the next month.  Or even the next four months.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


[HACKERS] Typed hstore proposal

2011-12-21 Thread Johann 'Myrkraverk' Oskarsson
Hi all,

I mean to create a typed hstore, called tstore for now.  I'm open to
name suggestions.  It'll only support a subset of core Postgres types
to begin with.  Keys are always text, it's the value that's typed.

Usage is very similar to hstore; this is not a complete reference.

  tstore( text, text)   Creates a text key-value pair.

  tstore( text, int4 )  Creates an integer key-value pair.

  tstore -> text    Returns a tvalue, which is basically an
oid-value pair.

  typeof( tvalue )  Returns the oid.

  tvalue::int4  The integer value.

  tvalue::text  The text value.

  each_int( tstore )Set of all keys and values where the value
is int4.

  each_text( tstore )   Set of all keys and values where the value
is text.

  each( tstore )Set of all keys and values as tvalues.

Some examples:

# select 'text: a'::tvalue;
  tvalue
--
 text: a
(1 row)

# select 'integer: 17'::tvalue;
  tvalue
--
 int4: 17
(1 row)

# select each_int( 'a -> text: b, b -> int: 17'::tstore );
 ?column? | ?column?
--+--
 b  |   17
(1 row)

# select 'a -> text: b, b -> int: 17'::tstore -> 'a';
  ?column? 
---
 text: b
(1 row)


All comments are welcome.  As the project matures, I'm sure other
needed functions will crop up.


-- 
   Johann Oskarssonhttp://www.2ndquadrant.com/|[]
   PostgreSQL Development, 24x7 Support, Training and Services  --+--
  |
   Blog: http://my.opera.com/myrkraverk/blog/

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Typed hstore proposal

2011-12-21 Thread Christopher Browne
On Wed, Dec 21, 2011 at 8:32 PM, Johann 'Myrkraverk' Oskarsson
joh...@2ndquadrant.com wrote:
 I mean to create a typed hstore, called tstore for now.  I'm open to
 name suggestions.  It'll only support a subset of core Postgres types
 to begin with.  Keys are always text, it's the value that's typed.

Unfortunately, I'm not sure it'll be of much interest unless it heads
all the way to having nested data.

JSON is the thing of the day, and it would be desirable for this to
be potent enough to represent it; JSON has the following types:

1.  Number (in practice, FLOAT)
2.  String (UTF-8)
3.  Boolean (t/f)
4.  Array (not necessarily of uniform type)
5.  Object (string key, JSON value pairs, unordered)
6.  NULL

#4 and #5 are obviously entirely more hairy.

But it seems pretty likely that people would be keen on additional
implementations until they get those.
-- 
When confronted by a difficult problem, solve it by reducing it to the
question, How would the Lone Ranger handle this?

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums + Double Writes

2011-12-21 Thread David Fetter
On Wed, Dec 21, 2011 at 04:18:33PM -0800, Rob Wultsch wrote:
 On Wed, Dec 21, 2011 at 1:59 PM, David Fetter da...@fetter.org wrote:
  One of the things VMware is working on is double writes, per
  previous discussions of how, for example, InnoDB does things.
 
 The world is moving to flash, and the lifetime of flash is measured in
 writes.  Potentially doubling the number of writes is potentially
 halving the life of the flash.
 
 Something to think about...

Modern flash drives let you have more write cycles than modern
spinning rust, so while yes, there is something happening, it's also
happening to spinning rust, too.

Cheers,
David.
-- 
David Fetter da...@fetter.org http://fetter.org/
Phone: +1 415 235 3778  AIM: dfetter666  Yahoo!: dfetter
Skype: davidfetter  XMPP: david.fet...@gmail.com
iCal: webcal://www.tripit.com/feed/ical/people/david74/tripit.ics

Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums + Double Writes

2011-12-21 Thread Simon Riggs
On Thu, Dec 22, 2011 at 12:06 AM, Simon Riggs si...@2ndquadrant.com wrote:

 Having two different page formats running around in the system at the
 same time is far from free; in the worst case it means that every single
 piece of code that touches pages has to know about and be prepared to
 cope with both versions.  That's a rather daunting prospect, from a
 coding perspective and even more from a testing perspective.  Maybe
 the issues can be kept localized, but I've seen no analysis done of
 what the impact would be or how we could minimize it.  I do know that
 we considered the idea and mostly rejected it a year or two back.

 I'm looking at that now.

 My feeling is it probably depends upon how different the formats are,
 so given we are discussing a 4 byte addition to the header, it might
 be doable.

 I'm investing some time on the required analysis.

We've assumed up to now that adding a CRC to the Page Header would add 4
bytes, i.e. that we would be using a CRC-32 check
field. This will change the size of the header and thus break
pg_upgrade in a straightforward implementation. Breaking pg_upgrade is
not acceptable. We can get around this by making code dependent upon
page version, allowing mixed page versions in one executable. That
causes the PageGetItemId() macro to be page version dependent. After
review, altering the speed of PageGetItemId() is not acceptable either
(show me microbenchmarks if you doubt that). In a large minority of
cases the line pointer and the page header will be in separate cache
lines.

As Kevin points out, we have 13 bits spare on the pd_flags of
PageHeader, so we have a little wiggle room there. In addition to that
I notice that the version part of pd_pagesize_version is only 8 bits (the
page size is packed into the other 8 bits), yet we currently use just one bit
of that, since the version is 4. Version 3 was last seen in Postgres 8.2, now
de-supported.

Since we don't care too much about backwards compatibility with data
in Postgres 8.2 and below, we can just assume that all pages are
version 4, unless marked otherwise with additional flags. We then use
two separate bits in pd_flags to show PD_HAS_CRC (0x0008 and 0x8000).
We then completely replace the 16 bit version field with a 16-bit CRC
value, rather than a 32-bit value. Why two flag bits? If either CRC
bit is set we assume the page's CRC is supposed to be valid. This
ensures that a single bit error doesn't switch off CRC checking when
it was supposed to be active. I suggest we remove the page size data
completely; if we need to keep that we should mark 8192 bytes as the
default and set bits for 16kB and 32 kB respectively.
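To make that concrete, here is a sketch of the re-organised header.  This is
not a patch; the existing field names and types are reproduced from memory of
bufpage.h and may not be exact.  The point is only that a 16-bit checksum
takes the slot currently occupied by pd_pagesize_version, so the header size
and PageGetItemId() are unchanged:

typedef struct PageHeaderData
{
    XLogRecPtr      pd_lsn;        /* LSN of last change to this page */
    uint16          pd_tli;        /* low bits of the TimeLineID */
    uint16          pd_flags;      /* flag bits, incl. the two PD_HAS_CRC bits */
    LocationIndex   pd_lower;      /* offset to start of free space */
    LocationIndex   pd_upper;      /* offset to end of free space */
    LocationIndex   pd_special;    /* offset to start of special space */
    uint16          pd_checksum;   /* replaces pd_pagesize_version */
    TransactionId   pd_prune_xid;  /* oldest prunable XID, or zero */
    ItemIdData      pd_linp[1];    /* line pointer array */
} PageHeaderData;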

With those changes, we are able to re-organise the page header so that
we can add a 16 bit checksum (CRC), yet retain the same size of
header. Thus, we don't need to change PageGetItemId(). We would
require changes to PageHeaderIsValid() and PageInit() only. Making
these changes means we are reducing the number of bits used to
validate the page header, though we are providing a much better way of
detecting page validity, so the change is of positive benefit.

Adding a CRC was a performance concern because of the hint bit
problem, so making the value 16 bits long gives performance where it
is needed. Note that we do now have a separation of bgwriter and
checkpointer, so we have more CPU bandwidth to address the problem.
Adding multiple bgwriters is also possible.

Notably, this proposal makes CRC checking optional, so if performance
is a concern it can be disabled completely.

Which CRC algorithm to choose?
A study of error detection capabilities for random independent bit
errors and burst errors reveals that XOR, two's complement addition,
and Adler checksums are suboptimal for typical network use. Instead,
one's complement addition should be used for networks willing to
sacrifice error detection effectiveness to reduce compute cost,
Fletcher checksum for networks looking for a balance of error
detection and compute cost, and CRCs for networks willing to pay a
higher compute cost for significantly improved error detection.
The Effectiveness of Checksums for Embedded Control Networks,
Maxino, T.C. & Koopman, P.J.,
Dependable and Secure Computing, IEEE Transactions on
Issue Date: Jan.-March 2009
Available here - http://www.ece.cmu.edu/~koopman/pubs/maxino09_checksums.pdf

Based upon that paper, I suggest we use Fletcher-16. The overall
concept is not sensitive to the choice of checksum algorithm, however,
and the algorithm itself could be made an option: F16 or CRC. My poor
understanding of the difference is that F16 is about 20 times cheaper
to calculate, at the expense of about 1000 times worse error detection
(but still pretty good).
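For reference, the straightforward Fletcher-16 formulation (two 8-bit running
sums modulo 255) looks like this.  It is just the textbook algorithm, not
code from any patch; an optimised version would defer the modulo, and the
checksum field itself would be zeroed or skipped while computing it:

#include <stddef.h>
#include <stdint.h>

static uint16_t
fletcher16(const uint8_t *data, size_t len)
{
    uint32_t    sum1 = 0;
    uint32_t    sum2 = 0;
    size_t      i;

    for (i = 0; i < len; i++)
    {
        sum1 = (sum1 + data[i]) % 255;
        sum2 = (sum2 + sum1) % 255;
    }
    return (uint16_t) ((sum2 << 8) | sum1);
}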

16 bit CRCs are not the strongest available, but still support
excellent error detection rates - better than 1 failure in a million,
possibly much better depending on which algorithm and block size.
That's good easily enough to detect our kind of errors.

This idea doesn't 

[HACKERS] Wishlist: parameterizable types

2011-12-21 Thread Joey Adams
This may be ambitious, but it'd be neat if PostgreSQL supported
parameterizable types.  For example, suppose a contrib module defines
a pair type.  It could be used as follows:

CREATE TABLE my_table (
coord pair(float, float)
);

The pair module could define functions like these for constructing
and examining pairs:

create function pair_create(a, b) returns pair(a, b);
create function pair_fst(pair(a, b)) returns a;
create function pair_snd(pair(a, b)) returns b;

Here, each function is polymorphic in two type variables, a and b.  As
far as I know, PostgreSQL only supports one type variable per
function, via the anyelement keyword.  Thus, unless we restrict
ourselves to only one type parameter, parameterizable types wouldn't
be very useful without support for multiple type variables.

PostgreSQL already has a parameterizable type: array.  However, it
would be nontrivial to introduce another such type.  Currently, nearly
every type in PostgreSQL has a corresponding array type.  For example,
in pg_type, there's money, and there's _money (array of money values).
Continuing with this pattern means we would need something like P*T
entries in pg_hba, where P is the number of type constructors (e.g.
array), and T is the number of base types.

Moreover, the array type isn't truly nestable.  For one, PostgreSQL
considers int[] and int[][][][] as the same type:

 select '{1,2,3}' :: int[][][][];
  int4
-
 {1,2,3}
(1 row)

Also, arrays inside of arrays aren't allowed, only multidimensional arrays:

 select '{{1,2},{3}}' :: int[][];
ERROR:  multidimensional arrays must have array expressions with
matching dimensions
LINE 1: select '{{1,2},{3}}' :: int[][];

Suppose I didn't like these restrictions on the array type, and wanted
to make a type called vector that addresses them.  It might be used
as follows:

 select '{{1,2},{3}}' :: vector(vector(int));

Note that I'm stacking the 'vector' type constructor.  The array type
doesn't let you do that.

We could stretch the idea even further, and allow parameters and
recursion in user-defined types:

create type object(a) as (
key text,
value either(a, object(a))
);

C++ supports parameterizable types through templates.  Java and C#
support them through generics.

What I have in mind comes from Haskell's type system (which inspired
generics in Java).  The functions defined earlier could be implemented
in Haskell as:

data Pair a b = Pair a b

pair_fst :: Pair a b -> a
pair_fst (Pair a _) = a

pair_snd :: Pair a b -> b
pair_snd (Pair _ b) = b

What I'm wondering is: how complex would it be to add such a feature
to PostgreSQL's type system?

- Joey

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Pause at end of recovery

2011-12-21 Thread Simon Riggs
On Wed, Dec 21, 2011 at 12:04 PM, Magnus Hagander mag...@hagander.net wrote:
 On Tue, Dec 20, 2011 at 18:15, Simon Riggs si...@2ndquadrant.com wrote:
 On Tue, Dec 20, 2011 at 1:40 PM, Magnus Hagander mag...@hagander.net wrote:
 These days we have pause_at_recovery_target, which lets us pause when
 we reach a PITR target. Is there a particular reason we don't have a
 way to pause at end of recovery if we *didn't* specify a target -
 meaning we let it run until the end of the archived log? While it's
 too late to change the target, I can see a lot of usescases where you
 don't want it to be possible to make changes to the database again
 until it has been properly verified - and keeping it up in readonly
 mode in that case can be quite useful...

 Useful for what purpose? It's possible to deny access in other ways already.

 For validating the restore, while allowing easy read-only access.

 If you could declare a read-only connection in pg_hba.conf it would
 give the same functionality, but you really can't...

 I'm not saying it's a big feature. But the way it looks now it seems
 to be artificially restricted from a usecase. Or is there a technical
 reason why we don't allow it?

I can see a reason to do this now. I've written a patch and will commit
on Friday. Nudge me if I don't.

-- 
 Simon Riggs   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] CLOG contention

2011-12-21 Thread Simon Riggs
On Thu, Dec 22, 2011 at 12:28 AM, Robert Haas robertmh...@gmail.com wrote:

 But on the flip side, I feel like your discussion of the problems is a
 bit hand-wavy.  I think we need some real test cases that we can look
 at and measure, not just an informal description of what we think is
 happening.

I understand why you say that and take no offence. All I can say is
last time I had access to a good test rig and well-structured
reporting and analysis I was able to see evidence of what I described
to you here.

I no longer have that access, which is the main reason I've not done
anything in the last few years. We both know you do have good access
and that's the main reason I'm telling you about it rather than just
doing it myself.


 * We allocate a new clog page every 32k xids. At the rates you have
 now measured, we will do this every 1-2 seconds.

 And a new pg_subtrans page quite a bit more frequently than that.

It is less of a concern, all the same. In most cases we can simply
drop pg_subtrans pages (though we don't do that as often as we could),
no fsync is required on write, no WAL record required for extension
and no update required at commit.

-- 
 Simon Riggs   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Page Checksums + Double Writes

2011-12-21 Thread Heikki Linnakangas

On 22.12.2011 01:43, Tom Lane wrote:

A utility to bump the page version is equally a whole lot easier said
than done, given that the new version has more overhead space and thus
less payload space than the old.  What does it do when the old page is
too full to be converted?  Move some data somewhere else might be
workable for heap pages, but I'm less sanguine about rearranging indexes
like that.  At the very least it would imply that the utility has full
knowledge about every index type in the system.


Remembering back the old discussions, my favorite scheme was to have an 
online pre-upgrade utility that runs on the old cluster, moving things 
around so that there is enough spare room on every page. It would do 
normal heap updates to make room on heap pages (possibly causing 
transient serialization failures, like all updates do), and split index 
pages to make room on them. Yes, it would need to know about all index 
types. And it would set a global variable to indicate that X bytes must 
be kept free on all future updates, too.


Once the pre-upgrade utility has scanned through the whole cluster, you 
can run pg_upgrade. After the upgrade, old page versions are converted 
to the new format as pages are read in. The conversion is straightforward, 
as the pre-upgrade utility ensured that there is enough spare room on 
every page.
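
To make that a little more concrete, here is a minimal sketch of the
per-page decision such a utility would make. The pd_lower/pd_upper
fields mirror the real page header, but the required-slack constant and
the "make room" actions are placeholders for the real heap update and
index split logic, not a design:

/*
 * Sketch only -- not real code.  Illustrates the per-page check a
 * pre-upgrade utility would have to perform.  The layout fields echo
 * PageHeaderData; everything else here is invented for illustration.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct
{
    uint16_t    pd_lower;       /* end of the line pointer array */
    uint16_t    pd_upper;       /* start of tuple data */
    bool        is_index;
} FakePage;

/* extra per-page overhead the new page version needs (pure assumption) */
#define NEW_VERSION_OVERHEAD 8

static void
pre_upgrade_page(FakePage *page)
{
    int         free_space = page->pd_upper - page->pd_lower;

    if (free_space >= NEW_VERSION_OVERHEAD)
    {
        printf("page already has %d free bytes, nothing to do\n", free_space);
        return;
    }

    if (page->is_index)
        printf("would split this index page to free %d bytes\n",
               NEW_VERSION_OVERHEAD);
    else
        printf("would move a heap tuple elsewhere (a normal UPDATE)\n");
}

int
main(void)
{
    FakePage    full_heap = {400, 404, false};
    FakePage    full_index = {4000, 4002, true};
    FakePage    roomy_page = {200, 6000, false};

    pre_upgrade_page(&full_heap);
    pre_upgrade_page(&full_index);
    pre_upgrade_page(&roomy_page);
    return 0;
}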


--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: GiST for range types (was Re: [HACKERS] Range Types - typo + NULL string constructor)

2011-12-21 Thread Jeff Davis
On Wed, 2011-12-14 at 01:04 +0400, Alexander Korotkov wrote:
 Hi!
 
Thank you! Attached are a few changes:

* Change "ordinal" to "normal" for clarity (at least to me).
* Some comment cleanup
* Change classes_groups to be an enum of SPLIT_LEFT and SPLIT_RIGHT,
rather than using 1 and 2.
* Changed the bounds_lower and bounds_upper variables to by_lower and
by_upper to indicate that the arrays are distinguished by sort order.
It was confusing to read otherwise.

A few comments:

* In range_gist_picksplit, it would be nice to have a little more
intuitive description of what's going on with the nonEmptyCount and
nonInfCount numbers. For instance, it appears to depend on the fact that
a range must be counted either in nonEmptyCount or in nonInfCount. Also,
can you describe the reason you're multiplying by two and taking the
absolute value? It seems to work, but I am missing the intuition behind
those operations. (A tiny worked example of what I think that quantity
measures follows this list of comments.)

* The penalty function is fairly hard to read still. At a high level, I
think we're trying to accomplish a few things (in order from most to
least important):
  - Keep normal ranges separate.
  - Avoid broadening the class of the original predicate (e.g. turning
single-infinite into double-infinite).
  - Avoid broadening (as determined by subtype_diff) the original
predicate.
  - Favor adding ranges to narrower original predicates.

Do you agree? If so, perhaps we can attack those priorities more
directly and the function might become a little more readable. (A rough
sketch of the kind of structure I have in mind also follows this list of
comments.)

Additionally, the arbitrary numbers might become a problem. Can we
choose better constants there? They would still be arbitrary when
compared with real numbers derived from subtype_diff, but maybe we can
still do better than what's there.

* Regarding the leftover common entries that can go to either side:
what is the delta measure trying to accomplish? When I work through
some examples, it seems to favor putting larger common ranges on the
left (small delta) and smaller common ranges on the right (smaller
delta). Why is that good? Or did I misread the code?

Intuitively, I would think that we'd want to push the ranges with lower
upper bounds to the left and higher lower bounds to the right -- in
other words, recurse. Obviously, we'd need to make sure it terminated at
some point, but splitting the common entries does seem like a smaller
version of the original problem. Thoughts?
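
On the multiply-by-two question above: if I'm reading the code right,
that quantity is simply the imbalance of the proposed split, because
|2*count - total| = |count - (total - count)|. A trivial standalone
check of that identity (plain C; nothing here is taken from the patch):

/*
 * Sanity check of the identity |2*count - total| = |left - right|.
 * Purely illustrative.
 */
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
    int         total = 100;
    int         count;

    for (count = 0; count <= total; count += 20)
    {
        int         left = count;
        int         right = total - count;

        printf("left=%3d right=%3d  |left-right|=%3d  |2*count-total|=%3d\n",
               left, right, abs(left - right), abs(2 * count - total));
    }
    return 0;
}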
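
And to make the "attack those priorities more directly" idea a bit more
concrete, here is a very rough sketch of the shape I have in mind: give
each priority its own tier, so a higher-priority concern always
outweighs any lower-priority one. All names, constants, and the
normalization assumption are invented for illustration; none of this is
taken from the patch:

/*
 * Sketch only, not the actual range_gist_penalty(): encode the
 * priorities as tiers so a higher-priority concern always dominates.
 */
#include <stdbool.h>
#include <stdio.h>

#define TIER    1.0e6           /* gap between priority levels */

static double
sketch_penalty(bool mixes_normal_and_special,   /* priority 1 */
               bool broadens_class,             /* priority 2 */
               double diff_growth,              /* priority 3: subtype_diff growth */
               double orig_width)               /* priority 4: width of original key */
{
    double      penalty = 0.0;

    if (mixes_normal_and_special)
        penalty += 2.0 * TIER;
    if (broadens_class)
        penalty += 1.0 * TIER;

    /* assume diff_growth and orig_width are normalized well below TIER */
    penalty += diff_growth;
    penalty += orig_width / TIER;   /* tiny tie-breaker favoring narrow keys */

    return penalty;
}

int
main(void)
{
    printf("same class, small growth:  %g\n",
           sketch_penalty(false, false, 1.5, 10.0));
    printf("class broadened:           %g\n",
           sketch_penalty(false, true, 0.0, 10.0));
    printf("normal mixed with special: %g\n",
           sketch_penalty(true, false, 0.0, 10.0));
    return 0;
}

The real function would of course need to derive those inputs from the
existing key and the new entry, and cope with range types that have no
subtype_diff.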

Thank you for the helpful comments! It took me a while to work through
the logic, but I would have been lost completely without the comments
around the double sorting split.

Regards,
Jeff Davis
*** a/src/backend/utils/adt/rangetypes_gist.c
--- b/src/backend/utils/adt/rangetypes_gist.c
***
*** 39,45 
  	((RangeType *) DatumGetPointer(datumCopy(PointerGetDatum(r), \
  			 false, -1)))
  
! /* Minimum accepted ratio of split */
  #define LIMIT_RATIO 0.3
  
  /* Helper macros to place an entry in the left or right group */
--- 39,49 
  	((RangeType *) DatumGetPointer(datumCopy(PointerGetDatum(r), \
  			 false, -1)))
  
! /*
!  * Minimum accepted ratio of split for items of the same class. If the items
!  * are of different classes, it will separate along those lines regardless of
!  * the ratio.
!  */
  #define LIMIT_RATIO 0.3
  
  /* Helper macros to place an entry in the left or right group */
***
*** 66,72 
   * GiST. Each unique combination of properties is a class. CLS_EMPTY cannot be
   * combined with anything else.
   */
! #define CLS_ORDINAL			0 /* Ordinal ranges (no bits set) */
  #define CLS_LOWER_INF		1 /* Lower bound is infinity */
  #define CLS_UPPER_INF		2 /* Upper bound is infinity */
  #define CLS_CONTAIN_EMPTY	4 /* Contains underlying empty ranges */
--- 70,76 
   * GiST. Each unique combination of properties is a class. CLS_EMPTY cannot be
   * combined with anything else.
   */
! #define CLS_NORMAL			0 /* Normal ranges (no bits set) */
  #define CLS_LOWER_INF		1 /* Lower bound is infinity */
  #define CLS_UPPER_INF		2 /* Upper bound is infinity */
  #define CLS_CONTAIN_EMPTY	4 /* Contains underlying empty ranges */
***
*** 76,81 
--- 80,102 
  			   * of properties. CLS_EMPTY doesn't combine with
  			   * anything else, so it's only 2^3 + 1. */
  
+ /*
+  * Auxiliary structure for picksplit based on single sorting.
+  */
+ typedef struct
+ {
+ 	int	index;
+ 	RangeBound			bound;
+ 	TypeCacheEntry	   *typcache;
+ } PickSplitSortItem;
+ 
+ /* place on left or right side of split? */
+ typedef enum
+ {
+ 	SPLIT_LEFT = 0, /* makes initialization to SPLIT_LEFT easier */
+ 	SPLIT_RIGHT
+ } SplitLR;
+ 
  static RangeType *range_super_union(TypeCacheEntry *typcache, RangeType *r1,
  	RangeType *r2);
  static bool range_gist_consistent_int(FmgrInfo *flinfo,
***
*** 97,103  static int sort_item_cmp(const void *a, const void *b);
  static void range_gist_class_split(TypeCacheEntry *typcache,
     GistEntryVector *entryvec,
     GIST_SPLITVEC *v,
!    

Re: GiST for range types (was Re: [HACKERS] Range Types - typo + NULL string constructor)

2011-12-21 Thread Jeff Davis
On Tue, 2011-12-20 at 13:22 +0400, Alexander Korotkov wrote:
 Hi!
 
 
 Studying this question a little more, I found that the current approach
 to range indexing can be dramatically inefficient in some cases. It's not
 because of the penalty or split implementation, but because of the
 approach itself. Mapping intervals to two-dimensional space produces much
 better results in the case of highly overlapping ranges and the @> and <@
 operators with low selectivity. 
 
Thank you for testing this. I agree that your approach is much better,
especially for dealing with widely varying range sizes. My approach
really only tackled the simple (and hopefully common) case where the
ranges are all about the same size.
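
For anyone else following the thread, my understanding of the
two-dimensional idea is: treat a range [lower, upper] as the point
(lower, upper); containment then becomes a simple box test over those
points, which an R-tree-style index handles well even when the ranges
overlap heavily. A toy illustration, with plain ints standing in for
the range bounds and ignoring bound inclusivity and special values:

/*
 * Minimal illustration of the 2-D mapping idea: a range [lower, upper]
 * becomes the point (lower, upper), and "q <@ k" (q contained in k)
 * becomes an axis-aligned box predicate on that point.
 */
#include <stdbool.h>
#include <stdio.h>

typedef struct
{
    int         lower;
    int         upper;
} SimpleRange;

/* true if q is contained in k, phrased as a 2-D box test */
static bool
contained_as_point(SimpleRange q, SimpleRange k)
{
    return q.lower >= k.lower && q.upper <= k.upper;
}

int
main(void)
{
    SimpleRange key = {10, 100};
    SimpleRange q1 = {20, 50};
    SimpleRange q2 = {5, 50};

    printf("[20,50] <@ [10,100]: %d\n", contained_as_point(q1, key));
    printf("[5,50]  <@ [10,100]: %d\n", contained_as_point(q2, key));
    return 0;
}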

Regards,
Jeff Davis



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers