Re: [HACKERS] Patch: show relation and tuple infos of a lock to acquire

2014-03-19 Thread Christian Kruse
Hi,

On 19/03/14 15:12, Alvaro Herrera wrote:
 I hope the silence meant assent, because I have pushed this patch now.

Great, thanks!

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpYfuVjR_gg_.pgp
Description: PGP signature


Re: [HACKERS] Patch: show relation and tuple infos of a lock to acquire

2014-03-17 Thread Christian Kruse
Hi Amit,

I've been ill the last few days, so sorry for my late response.

 I have updated the patch to pass TID and operation information in
 error context and changed some of the comments in code.
 Let me know if the added operation information is useful, else
 we can use better generic message in context.

I don't think that this fixes the translation guideline issues brought
up by Robert. This produces differing strings for the different cases
as well and it also passes in altering data directly to the error
string.

It also may produce error messages that are really weird. You
initialize the string with while attempting to . The remaining part
of the function is covered by if()s which may lead to error messages
like this:

„while attempting to “
„while attempting to in relation public.xyz of database abc“
„while attempting to in database abc“

Although this may not be very likely (because
ItemPointerIsValid((info-ctid))) should in this case not return
false).

Attached you will find a new version of this patch; it slightly
violates the translation guidelines as well: it assembles an error
string (but it doesn't pass in altering data like ctid or things like
that). I simply couldn't think of a nice solution without doing so,
and after looking through the code there are a few cases
(e.g. CheckTableNotInUse()) where this is done, too. If we insist of
having complete strings in this case we would have to have 6 * 3 * 2
error strings in the code.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index e2337ac..c0f5881 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -107,6 +107,8 @@ static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
 		uint16 t_infomask);
 static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
 int *remaining, uint16 infomask);
+static void MultiXactIdWaitExtended(Relation rel, ItemPointerData ctid, const char *opr_name,
+	MultiXactId multi, MultiXactStatus status, int *remaining, uint16 infomask);
 static bool ConditionalMultiXactIdWait(MultiXactId multi,
 		   MultiXactStatus status, int *remaining,
 		   uint16 infomask);
@@ -2714,8 +2716,9 @@ l1:
 		if (infomask  HEAP_XMAX_IS_MULTI)
 		{
 			/* wait for multixact */
-			MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate,
-			NULL, infomask);
+			MultiXactIdWaitExtended(relation, tp.t_data-t_ctid, delete,
+	(MultiXactId) xwait,
+	MultiXactStatusUpdate, NULL, infomask);
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
@@ -2741,7 +2744,8 @@ l1:
 		else
 		{
 			/* wait for regular transaction to end */
-			XactLockTableWait(xwait);
+			XactLockTableWaitExtended(relation, tp.t_data-t_ctid,
+	  delete, xwait);
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
@@ -3266,8 +3270,8 @@ l2:
 			int			remain;
 
 			/* wait for multixact */
-			MultiXactIdWait((MultiXactId) xwait, mxact_status, remain,
-			infomask);
+			MultiXactIdWaitExtended(relation, oldtup.t_data-t_ctid, update,
+	   (MultiXactId) xwait, mxact_status, remain, infomask);
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
@@ -3306,7 +3310,7 @@ l2:
 			/*
 			 * There was no UPDATE in the MultiXact; or it aborted. No
 			 * TransactionIdIsInProgress() call needed here, since we called
-			 * MultiXactIdWait() above.
+			 * MultiXactIdWaitExtended() above.
 			 */
 			if (!TransactionIdIsValid(update_xact) ||
 TransactionIdDidAbort(update_xact))
@@ -3341,7 +3345,8 @@ l2:
 			else
 			{
 /* wait for regular transaction to end */
-XactLockTableWait(xwait);
+XactLockTableWaitExtended(relation, oldtup.t_data-t_ctid,
+		  update, xwait);
 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 /*
@@ -4409,7 +4414,10 @@ l3:
 		RelationGetRelationName(relation;
 }
 else
-	MultiXactIdWait((MultiXactId) xwait, status, NULL, infomask);
+	MultiXactIdWaitExtended(relation, tuple-t_data-t_ctid,
+			lock, (MultiXactId) xwait,
+			status, NULL,
+			infomask);
 
 /* if there are updates, follow the update chain */
 if (follow_updates 
@@ -4464,7 +4472,8 @@ l3:
 		RelationGetRelationName(relation;
 }
 else
-	XactLockTableWait(xwait);
+	XactLockTableWaitExtended(relation, tuple-t_data-t_ctid,
+			  lock, xwait);
 
 /* if there are updates, follow the update chain */
 if (follow_updates 
@@ -5151,7 +5160,9 @@ l4:
 	if (needwait)
 	{
 		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-		XactLockTableWait(members[i].xid);
+		XactLockTableWaitExtended(rel, mytup.t_data-t_ctid,
+  lock updated,
+  members[i].xid);
 		pfree(members);
 		goto l4;
 	}
@@ -5211,7 +5222,8 @@ l4:
 if (needwait)
 {
 	LockBuffer(buf

Re: [HACKERS] Patch: Show process IDs of processes holding a lock; show relation and tuple infos of a lock to acquire

2014-03-12 Thread Christian Kruse
Hi,

On 13/03/14 03:27, Fujii Masao wrote:
 Committed!

Thank you very much!

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpkDoVMmXIL4.pgp
Description: PGP signature


Re: [HACKERS] Patch: show relation and tuple infos of a lock to acquire

2014-03-11 Thread Christian Kruse
Hi,

On 11/03/14 13:23, Amit Kapila wrote:
 [… snip …]
 So I think it's better to leave logging it as you have done in
 patch.

Agreed.

 […]
 2. Name new functions as  
 MultiXactIdWaitExtended()/XactLockTableWaitExtended()
 or MultiXactIdWaitEx()/XactLockTableWaitEx().
 You can find some other similar functions (ReadBufferExtended,
 relation_openrv_extended)
 
 3. MultiXactIdWaitWithInfo()/XactLockTableWaitWithInfo()
 
 Earlier I found option 3 as a good choice, but now again thinking on
 it I am leaning towards option 2.

Changing it once again will only cause more work and won't do a big
difference. So I suggest sticking with the current function names.

 Today, again looking into it, I found that function
 heap_lock_updated_tuple_rec() is using SnapshotAny to fetch the tuple
 and I think for this case also it's not safe to Log the tuple.
 
 Could you please once check (if you are comfortable doing so) wherever
 this patch is passing tuple, whether it is okay to pass it based on visibility
 rules, else I will again verify it once.

I think I got all places, but it would be nice to have a confirmation.

  [… policy regarding whitespaces …]
 The simple rule I follow is don't touch the code which has no relation
 to current patch.

OK. Thanks.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 71ec740..bcc656a 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -107,6 +107,8 @@ static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
 		uint16 t_infomask);
 static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
 int *remaining, uint16 infomask);
+static void MultiXactIdWaitWithInfo(Relation rel, HeapTuple tup, MultiXactId multi,
+	MultiXactStatus status, int *remaining, uint16 infomask);
 static bool ConditionalMultiXactIdWait(MultiXactId multi,
 		   MultiXactStatus status, int *remaining,
 		   uint16 infomask);
@@ -2714,8 +2716,8 @@ l1:
 		if (infomask  HEAP_XMAX_IS_MULTI)
 		{
 			/* wait for multixact */
-			MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate,
-			NULL, infomask);
+			MultiXactIdWaitWithInfo(relation, tp, (MultiXactId) xwait,
+	MultiXactStatusUpdate, NULL, infomask);
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
@@ -2741,7 +2743,7 @@ l1:
 		else
 		{
 			/* wait for regular transaction to end */
-			XactLockTableWait(xwait);
+			XactLockTableWaitWithInfo(relation, tp, xwait);
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
@@ -3266,8 +3268,8 @@ l2:
 			int			remain;
 
 			/* wait for multixact */
-			MultiXactIdWait((MultiXactId) xwait, mxact_status, remain,
-			infomask);
+			MultiXactIdWaitWithInfo(relation, oldtup,
+	   (MultiXactId) xwait, mxact_status, remain, infomask);
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
@@ -3306,7 +3308,7 @@ l2:
 			/*
 			 * There was no UPDATE in the MultiXact; or it aborted. No
 			 * TransactionIdIsInProgress() call needed here, since we called
-			 * MultiXactIdWait() above.
+			 * MultiXactIdWaitWithInfo() above.
 			 */
 			if (!TransactionIdIsValid(update_xact) ||
 TransactionIdDidAbort(update_xact))
@@ -3341,7 +3343,7 @@ l2:
 			else
 			{
 /* wait for regular transaction to end */
-XactLockTableWait(xwait);
+XactLockTableWaitWithInfo(relation, oldtup, xwait);
 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 /*
@@ -4409,7 +4411,8 @@ l3:
 		RelationGetRelationName(relation;
 }
 else
-	MultiXactIdWait((MultiXactId) xwait, status, NULL, infomask);
+	MultiXactIdWaitWithInfo(relation, tuple,
+(MultiXactId) xwait, status, NULL, infomask);
 
 /* if there are updates, follow the update chain */
 if (follow_updates 
@@ -4464,7 +4467,7 @@ l3:
 		RelationGetRelationName(relation;
 }
 else
-	XactLockTableWait(xwait);
+	XactLockTableWaitWithInfo(relation, tuple, xwait);
 
 /* if there are updates, follow the update chain */
 if (follow_updates 
@@ -5151,7 +5154,7 @@ l4:
 	if (needwait)
 	{
 		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-		XactLockTableWait(members[i].xid);
+		XactLockTableWaitWithInfo(rel, NULL, members[i].xid);
 		pfree(members);
 		goto l4;
 	}
@@ -5211,7 +5214,7 @@ l4:
 if (needwait)
 {
 	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-	XactLockTableWait(rawxmax);
+	XactLockTableWaitWithInfo(rel, NULL, rawxmax);
 	goto l4;
 }
 if (res != HeapTupleMayBeUpdated)
@@ -6169,6 +6172,33 @@ MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
 }
 
 /*
+ * MultiXactIdWaitWithInfo
+ *		Sets up the error context callback for reporting tuple
+ *		information and relation for a lock to aquire
+ *
+ * Use this instead of calling MultiXactIdWait()
+ */
+static void
+MultiXactIdWaitWithInfo

Re: [HACKERS] Patch: show relation and tuple infos of a lock to acquire

2014-03-10 Thread Christian Kruse
Hi,

thanks for the continued review.

On 09/03/14 12:15, Amit Kapila wrote:
 1.
  ISTM that you should be printing just the value and the unique index
  there, and not any information about the tuple proper.
 
 Good point, I will have a look at this.
 
 This point is still not handled in patch, […]

There have been some complaints about this by Andres, so I left it.

 […] due to which the message it displays seems to be
 incomplete. Message it displays is as below:
 
 LOG:  process 1800 still waiting for ShareLock on transaction 679 after 
 1014.000
  ms
 CONTEXT:  while attempting to lock in relation public.idx_t1 of database 
 pos
 tgres

Well, there is no tuple information available, so we have to leave it out.

 Here it is not clear what it attempts *lock in*

Ok, I reworded the message as you suggested below. Should make this
clearer as well.

 2. In function XactLockTableWaitErrorContextCallback(), ignore dropped
 columns in tuple, else it will lead to following failure:
 […]
 Problem is in Session-2 (cache lookup failed), when it tries to print values
 for dropped attribute.

Urghs. I really thought I had done this in the patch before. Thanks
for pointing out, fixed in attached patch.

 3.
  in relation \%s\.\%s\ of database %s,
 Why to display only relation name in quotes?
 I think database name should also be displayed in quotes.

Didn't think that it would make sense; the quoting makes sense for the
schema and relation name, but I did not see any need to quote the
database name. However, fixed in attached patch.

 4.
 Context message:
 while attempting to lock tuple (0,2) .
 
 I think it will be better to rephrase it (may be by using 'operate on'
 instead of 'lock').
 The reason is that actually we trying to lock on transaction, so it doesn't 
 make
 good sense to use lock on tuple

Fixed.

 5.
 + XactLockTableWaitWithInfo(relation, tp, xwait);
 
 + MultiXactIdWaitWithErrorContext(relation, tp, (MultiXactId) xwait,
 +  MultiXactStatusUpdate, NULL, infomask);
 
 I think we should make the name of MultiXactIdWaitWithErrorContext()
 similar to XactLockTableWaitWithInfo.

Fixed as well.

Can you explain why you prefer the WithInfo naming? Just curious, it
seemed to me the more descriptive name should have been preferred.

 6.
 @@ -1981,7 +1981,8 @@ EvalPlanQualFetch(EState *estate, Relation
 relation, int lockmode,
   if (TransactionIdIsValid(SnapshotDirty.xmax))
   {
   ReleaseBuffer(buffer);
 - XactLockTableWait(SnapshotDirty.xmax);
 + XactLockTableWaitWithInfo(relation, tuple,
 +  SnapshotDirty.xmax);
 
 In function EvalPlanQualFetch(), we are using SnapshotDirty to fetch
 the tuple, so I think there is a chance that it will log the tuple which
 otherwise will not be visible. I don't think this is right.

Hm, after checking source I tend to agree. I replaced it with NULL.

 8.
  void
  WaitForLockers(LOCKTAG heaplocktag, LOCKMODE lockmode)
  {
 - List   *l;
 + List   *l;
 
 Extra spacing not needed.

What is the policy to do here? The extra spaces have been added by
pg_indent, and there have been more changes in a couple of other files
which I had to drop manually as well. How should this been handled
generally?

 9.
 /*
  * XactLockTableWaitErrorContextCallback
  * error context callback set up by
  * XactLockTableWaitWithInfo. It reports some
  * tuple information and the relation of a lock to aquire
  *
  */
 Please improve indentation of above function header

Heh. Seems that Emacs' fill-paragraph messed this up. Thanks, fixed.

 10.
 +#include access/htup.h
 +
 +struct XactLockTableWaitLockInfo
 +{
 + Relation rel;
 + HeapTuple tuple;
 +};
 
 I think it is better to add comments for this structure.
 You can refer comments for struct HeapUpdateFailureData

Fixed.

 
 11.
 + *
 + * Use this instead of calling XactTableLockWait()
 
 In above comment, function name used is wrong and I am not sure this
 is right statement to use because still we are using XactLockTableWait()
 at few places like in function Do_MultiXactIdWait() […]

Fixed function name, but I'd like to keep the wording;
Do_MultiXactIdWait() is wrapped by MultiXactIdWaitWithInfo() and
therefore sets up the context as well.

 […] and recently this is used in function SnapBuildFindSnapshot().

Hm, should we add the context there, too?

 12.
 heap_update()
 {
 ..
 ..
 /*
  * There was no UPDATE in the MultiXact; or it aborted. No
  * TransactionIdIsInProgress() call needed here, since we called
  * MultiXactIdWait() above.
 
 Change the function name in above comment.

Fixed.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 71ec740..0cf3537 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -107,6 +107,8 @@ static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
 		uint16 t_infomask);
 static void

Re: [HACKERS] Patch: show relation and tuple infos of a lock to acquire

2014-03-10 Thread Christian Kruse
Hi,

On 10/03/14 14:59, Robert Haas wrote:
 On Mon, Mar 10, 2014 at 7:44 AM, Christian Kruse
 christ...@2ndquadrant.com wrote:
  [ response to review ]
 
 This response seems to have made no mention of point #7 from Amit's
 review, which seems to me to be a rather important one.

Just didn't notice it because the previous point was the same. NULL'd
the tuple there, too:

--- a/src/backend/executor/execUtils.c
+++ b/src/backend/executor/execUtils.c
@@ -1307,7 +1307,7 @@ retry:
if (TransactionIdIsValid(xwait))
{
index_endscan(index_scan);
-   XactLockTableWait(xwait);
+   XactLockTableWaitWithInfo(heap, NULL, xwait);
goto retry;
}

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpkcb2npz9m2.pgp
Description: PGP signature


Re: [HACKERS] [PATCH] Use MAP_HUGETLB where supported (v3)

2014-03-04 Thread Christian Kruse
Hi,

On 03/03/14 21:03, Heikki Linnakangas wrote:
 I spotted this in section 17.4.1 Shared Memory and Semaphores:
 
 Linux
 
 The default maximum segment size is 32 MB, and the default maximum total 
  size is 2097152 pages. A page is almost always 4096 bytes except in unusual 
  kernel configurations with huge pages (use getconf PAGE_SIZE to verify).
 
 It's not any more wrong now than it's always been, but I don't think huge
 pages ever affect PAGE_SIZE... Could I cajole you into rephrasing that, too?

Hm… to be honest, I'm not sure how to change that. What about this?

The default maximum segment size is 32 MB, and the
default maximum total size is 2097152
pages.  A page is almost always 4096 bytes except in
kernel configurations with quotehuge pages/quote
(use literalcat /proc/meminfo | grep Hugepagesize/literal
to verify), but they have to be enabled explicitely via
xref linkend=guc-huge-pages. See
xref linkend=linux-huge-pages for details.

I attached a patch doing this change.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml
index 7f4a235..8811097 100644
--- a/doc/src/sgml/runtime.sgml
+++ b/doc/src/sgml/runtime.sgml
@@ -881,9 +881,12 @@ optionSEMMAP=256
para
 The default maximum segment size is 32 MB, and the
 default maximum total size is 2097152
-pages.  A page is almost always 4096 bytes except in unusual
+pages.  A page is almost always 4096 bytes except in
 kernel configurations with quotehuge pages/quote
-(use literalgetconf PAGE_SIZE/literal to verify).
+(use literalcat /proc/meminfo | grep Hugepagesize/literal
+to verify), but they have to be enabled explicitely via
+xref linkend=guc-huge-pages. See
+xref linkend=linux-huge-pages for details.
/para
 
para


pgpi_wQpDEj0f.pgp
Description: PGP signature


Re: [HACKERS] [PATCH] Use MAP_HUGETLB where supported (v3)

2014-03-03 Thread Christian Kruse
Hi,

 Attached is a patch with the updated documentation (now uses
 consistently huge pages) as well as a renamed GUC, consistent wording
 (always use huge pages) as well as renamed variables.
 
 Hmm, I wonder if that could now be misunderstood to have something to do
 with the PostgreSQL page size? Maybe add the word memory or operating
 system in the first sentence in the docs, like this: Enables/disables the
 use of huge memory pages.

Accepted, see attached patch.

para
 At present, this feature is supported only on Linux. The setting is
 ignored on other systems when set to literaltry/literal.
 productnamePostgreSQL/productname will
 refuse to start when set to literalon/literal.
/para
 
 Is it clear enough that PostgreSQL will only refuse to start up when it's
 set to on, *if the feature's not supported on the platform*? Perhaps just
 leave that last sentence out. It's mentioned later that  With
 literalon/literal, failure to use huge pages will prevent the server
 from starting up., that's probably enough.

Fixed.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index fa9ee37..065c1db 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1166,14 +1166,14 @@ include 'filename'
   /listitem
  /varlistentry
 
- varlistentry id=guc-huge-tlb-pages xreflabel=huge_tlb_pages
-  termvarnamehuge_tlb_pages/varname (typeenum/type)/term
+ varlistentry id=guc-huge-pages xreflabel=huge_pages
+  termvarnamehuge_pages/varname (typeenum/type)/term
   indexterm
-   primaryvarnamehuge_tlb_pages/ configuration parameter/primary
+   primaryvarnamehuge_pages/ configuration parameter/primary
   /indexterm
   listitem
para
-Enables/disables the use of huge TLB pages. Valid values are
+Enables/disables the use of huge memory pages. Valid values are
 literaltry/literal (the default), literalon/literal,
 and literaloff/literal.
/para
@@ -1181,19 +1181,16 @@ include 'filename'
para
 At present, this feature is supported only on Linux. The setting is
 ignored on other systems when set to literaltry/literal.
-productnamePostgreSQL/productname will
-refuse to start when set to literalon/literal.
/para
 
para
-The use of huge TLB pages results in smaller page tables and
-less CPU time spent on memory management, increasing performance. For
-more details, see
-xref linkend=linux-huge-tlb-pages.
+The use of huge pages results in smaller page tables and less CPU time
+spent on memory management, increasing performance. For more details,
+see xref linkend=linux-huge-pages.
/para
 
para
-With varnamehuge_tlb_pages/varname set to literaltry/literal,
+With varnamehuge_pages/varname set to literaltry/literal,
 the server will try to use huge pages, but fall back to using
 normal allocation if that fails. With literalon/literal, failure
 to use huge pages will prevent the server from starting up. With
diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml
index 5f9fa61..7f4a235 100644
--- a/doc/src/sgml/runtime.sgml
+++ b/doc/src/sgml/runtime.sgml
@@ -1308,13 +1308,13 @@ echo -1000  /proc/self/oom_score_adj
/note
   /sect2
 
-  sect2 id=linux-huge-tlb-pages
-   titleLinux huge TLB pages/title
+  sect2 id=linux-huge-pages
+   titleLinux huge pages/title
 
para
-Using huge TLB pages reduces overhead when using large contiguous chunks
-of memory, like PostgreSQL does. To enable this feature
-in productnamePostgreSQL/productname you need a kernel
+Using huge pages reduces overhead when using large contiguous chunks of
+memory, like productnamePostgreSQL/productname does. To enable this
+feature in productnamePostgreSQL/productname you need a kernel
 with varnameCONFIG_HUGETLBFS=y/varname and
 varnameCONFIG_HUGETLB_PAGE=y/varname. You also have to tune the system
 setting varnamevm.nr_hugepages/varname. To estimate the number of
@@ -1345,7 +1345,7 @@ $ userinputsysctl -w vm.nr_hugepages=3170/userinput
 productnamePostgreSQL/productname is to use them when possible and
 to fallback to normal pages when failing. To enforce the use of huge
 pages, you can set
-link linkend=guc-huge-tlb-pagesvarnamehuge_tlb_pages/varname/link
+link linkend=guc-huge-pagesvarnamehuge_pages/varname/link
 to literalon/literal. Note that in this case
 productnamePostgreSQL/productname will fail to start if not enough huge
 pages are available.
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 65ad595..51c1a2b 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port

Re: [HACKERS] [PATCH] Use MAP_HUGETLB where supported (v3)

2014-03-03 Thread Christian Kruse
Hi,

On 28/02/14 17:58, Peter Geoghegan wrote:
 On Fri, Feb 28, 2014 at 9:43 AM, Heikki Linnakangas
 hlinnakan...@vmware.com wrote:
  Hmm, I wonder if that could now be misunderstood to have something to do
  with the PostgreSQL page size? Maybe add the word memory or operating
  system in the first sentence in the docs, like this: Enables/disables the
  use of huge memory pages.
 
 Whenever I wish to emphasize that distinction, I tend to use the term
 MMU pages.

I don't like to distinct that much from Linux terminology, this may
lead to confusion. And to use this term only in one place doesn't seem
to make sense, too – naming will then be inconsistent and thus lead to
confusion, too. Do you agree?

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpkad_A3izOE.pgp
Description: PGP signature


Re: [HACKERS] [PATCH] Use MAP_HUGETLB where supported (v3)

2014-02-27 Thread Christian Kruse
Hi,

On 27/02/14 08:35, Christian Kruse wrote:
 Hi Peter,

Sorry, Stephen of course – it was definitely to early.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpm6lYpan4Df.pgp
Description: PGP signature


Re: [HACKERS] Patch: show relation and tuple infos of a lock to acquire

2014-02-27 Thread Christian Kruse
Hi,

On 25/02/14 16:11, Robert Haas wrote:
 On Mon, Feb 24, 2014 at 10:13 AM, Christian Kruse
 christ...@2ndquadrant.com wrote:
  To be honest, I don't like the idea of setting up this error context
  only for log_lock_wait messages. This sounds unnecessary complex to me
  and I think that in the few cases where this message doesn't add a
  value (and thus is useless) don't justify such complexity.
 
 Reading this over, I'm not sure I understand why this is a CONTEXT at
 all and not just a DETAIL for the particular error message that it's
 supposed to be decorating.  Generally CONTEXT should be used for
 information that will be relevant to all errors in a given code path,
 and DETAIL for extra information specific to a particular error.

Because there is more than one scenario where this context is useful,
not just log_lock_wait messages.

 If we're going to stick with CONTEXT, we could rephrase it like this:
 
 CONTEXT: while attempting to lock tuple (1,2) in relation with OID 3456
 
 or when the relation name is known:
 
 CONTEXT: while attempting to lock tuple (1,2) in relation public.foo

Accepted. Patch attached.

  Displaying whole tuple doesn't seem to be the most right way
  to get debug information and especially in this case we are
  already displaying tuple offset(ctid) which is unique identity
  to identify a tuple. It seems to me that it is sufficient to display
  unique value of tuple if present.
  I understand that there is no clear issue here, so may be if others also
  share their opinion then it will be quite easy to take a call.
 
 I wouldn't be inclined to dump the whole tuple under any
 circumstances.  That could be a lot more data than what you want
 dumped in your log.  The PK could already be somewhat unreasonably
 large, but the whole tuple could be a lot more unreasonably large.

Well, as I already stated: we don't. I copied the behavior we use in
CHECK constraints (ExecBuildSlotValueDescription()).

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index a771ccb..a04414e 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -107,6 +107,8 @@ static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
 		uint16 t_infomask);
 static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
 int *remaining, uint16 infomask);
+static void MultiXactIdWaitWithErrorContext(Relation rel, HeapTuple tup, MultiXactId multi,
+	MultiXactStatus status, int *remaining, uint16 infomask);
 static bool ConditionalMultiXactIdWait(MultiXactId multi,
 		   MultiXactStatus status, int *remaining,
 		   uint16 infomask);
@@ -2703,8 +2705,8 @@ l1:
 		if (infomask  HEAP_XMAX_IS_MULTI)
 		{
 			/* wait for multixact */
-			MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate,
-			NULL, infomask);
+			MultiXactIdWaitWithErrorContext(relation, tp, (MultiXactId) xwait,
+	  MultiXactStatusUpdate, NULL, infomask);
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
@@ -2730,7 +2732,7 @@ l1:
 		else
 		{
 			/* wait for regular transaction to end */
-			XactLockTableWait(xwait);
+			XactLockTableWaitWithInfo(relation, tp, xwait);
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
@@ -3255,8 +3257,8 @@ l2:
 			int			remain;
 
 			/* wait for multixact */
-			MultiXactIdWait((MultiXactId) xwait, mxact_status, remain,
-			infomask);
+			MultiXactIdWaitWithErrorContext(relation, oldtup,
+	   (MultiXactId) xwait, mxact_status, remain, infomask);
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
@@ -3330,7 +3332,7 @@ l2:
 			else
 			{
 /* wait for regular transaction to end */
-XactLockTableWait(xwait);
+XactLockTableWaitWithInfo(relation, oldtup, xwait);
 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 /*
@@ -4398,7 +4400,8 @@ l3:
 		RelationGetRelationName(relation;
 }
 else
-	MultiXactIdWait((MultiXactId) xwait, status, NULL, infomask);
+	MultiXactIdWaitWithErrorContext(relation, tuple,
+(MultiXactId) xwait, status, NULL, infomask);
 
 /* if there are updates, follow the update chain */
 if (follow_updates 
@@ -4453,7 +4456,7 @@ l3:
 		RelationGetRelationName(relation;
 }
 else
-	XactLockTableWait(xwait);
+	XactLockTableWaitWithInfo(relation, tuple, xwait);
 
 /* if there are updates, follow the update chain */
 if (follow_updates 
@@ -5140,7 +5143,7 @@ l4:
 	if (needwait)
 	{
 		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-		XactLockTableWait(members[i].xid);
+		XactLockTableWaitWithInfo(rel, mytup, members[i].xid);
 		pfree(members);
 		goto l4;
 	}
@@ -5200,7 +5203,7 @@ l4:
 if (needwait)
 {
 	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-	XactLockTableWait(rawxmax);
+	XactLockTableWaitWithInfo(rel, mytup

Re: [HACKERS] [PATCH] Use MAP_HUGETLB where supported (v3)

2014-02-26 Thread Christian Kruse
Hi,

On 25/02/14 19:28, Andres Freund wrote:
 I think all that's needed is to cut the first paragraphs that generally
 explain what huge pages are in some detail from the text and make sure
 the later paragraphs don't refer to the earlier ones.

Attached you will find a new version of the patch.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 4dc1277..0006090 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1128,10 +1128,7 @@ include 'filename'
 The use of huge TLB pages results in smaller page tables and
 less CPU time spent on memory management, increasing performance. For
 more details, see
-ulink url=https://wiki.debian.org/Hugepages;the Debian wiki/ulink.
-Remember that you will need at least shared_buffers / huge page size +
-1 huge TLB pages. So for example for a system with 6GB shared buffers
-and a hugepage size of 2kb of you will need at least 3156 huge pages.
+xref linkend=linux-huge-tlb-pages.
/para
 
para
diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml
index bbb808f..f172526 100644
--- a/doc/src/sgml/runtime.sgml
+++ b/doc/src/sgml/runtime.sgml
@@ -1307,6 +1307,55 @@ echo -1000  /proc/self/oom_score_adj
/para
/note
   /sect2
+
+  sect2 id=linux-huge-tlb-pages
+   titleLinux huge TLB pages/title
+
+   para
+To enable this feature in productnamePostgreSQL/productname you need a
+kernel with varnameCONFIG_HUGETLBFS=y/varname and
+varnameCONFIG_HUGETLB_PAGE=y/varname. You also have to tune the system
+setting varnamevm.nr_hugepages/varname. To calculate the number of
+necessary huge pages start productnamePostgreSQL/productname without
+huge pages enabled and check the varnameVmPeak/varname value from the
+proc filesystem:
+programlisting
+$ userinputhead -1 /path/to/data/directory/postmaster.pid/userinput
+4170
+$ userinputgrep ^VmPeak /proc/4170/status/userinput
+VmPeak:  6490428 kB
+/programlisting
+ literal6490428/literal / literal2048/literal
+ (varnamePAGE_SIZE/varname is literal2MB/literal in this case) are
+ roughly literal3169.154/literal huge pages, so you will need at
+ least literal3170/literal huge pages:
+programlisting
+$ userinputsysctl -w vm.nr_hugepages=3170/userinput
+/programlisting
+Sometimes the kernel is not able to allocate the desired number of huge
+pages, so it might be necessary to repeat that command or to reboot. Don't
+forget to add an entry to filename/etc/sysctl.conf/filename to persist
+this setting through reboots.
+   /para
+
+   para
+The default behavior for huge pages in
+productnamePostgreSQL/productname is to use them when possible and
+to fallback to normal pages when failing. To enforce the use of huge
+pages, you can set
+link linkend=guc-huge-tlb-pagesvarnamehuge_tlb_pages/varname/link
+to literalon/literal. Note that in this case
+productnamePostgreSQL/productname will fail to start if not enough huge
+pages are available.
+   /para
+
+   para
+For a detailed description of the productnameLinux/productname huge
+pages feature have a look
+at ulink url=https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt;https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt/ulink.
+   /para
+
+  /sect2
  /sect1
 
 


pgpNzko1BF91A.pgp
Description: PGP signature


Re: [HACKERS] [PATCH] Use MAP_HUGETLB where supported (v3)

2014-02-26 Thread Christian Kruse
Hi Peter,

after a night of sleep I'm still not able to swallow the pill. To be
honest I'm a little bit angry about this accusation.

I didn't mean to copy from the Debian wiki and after re-checking the
text again I'm still convinced that I didn't.

Of course the text SAYS something similar, but this is in the nature
of things. Structure, diction and focus are different. Also the
information transferred is different and gathered from various
articles, including the Debian wiki, the huge page docs of the kernel,
the Wikipedia and some old IBM and Oracle docs.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpIF57Iw0CqM.pgp
Description: PGP signature


Re: [HACKERS] Patch: show xid and xmin in pg_stat_activity and pg_stat_replication

2014-02-26 Thread Christian Kruse
Hi Robert,

On 25/02/14 12:37, Robert Haas wrote:
 Committed, after fixing the regression test outputs.  I also changed
 the new fields to be NULL rather than 0 when they are invalid, because
 that way applying age() to that column does something useful instead
 of something lame.

Hm, thought I had done that. However, thanks for committing and
fixing!

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpBD5IQLGElj.pgp
Description: PGP signature


Re: [HACKERS] [PATCH] Use MAP_HUGETLB where supported (v3)

2014-02-26 Thread Christian Kruse
Hi,

On 26/02/14 14:34, Heikki Linnakangas wrote:
 That still says The setting is ignored on other systems. That's not quite
 true: as explained later in the section, if you set huge_tlb_pages=on and
 the platform doesn't support it, the server will refuse to start.

I added a sentence about it.

 Using huge TLB pages reduces overhead when using large contiguous chunks of
 memory, like PostgreSQL does.

Sentence added.

 That's good advice, but perhaps s/calculate/estimate/. It's just an
 approximation, after all.

Fixed.

New patch version is attached.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 4dc1277..c5c2d8b 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1120,18 +1120,17 @@ include 'filename'
/para
 
para
-At present, this feature is supported only on Linux. The setting
-is ignored on other systems.
+At present, this feature is supported only on Linux. The setting is
+ignored on other systems when set to literaltry/literal.
+productnamePostgreSQL/productname will
+refuse to start when set to literalon/literal.
/para
 
para
 The use of huge TLB pages results in smaller page tables and
 less CPU time spent on memory management, increasing performance. For
 more details, see
-ulink url=https://wiki.debian.org/Hugepages;the Debian wiki/ulink.
-Remember that you will need at least shared_buffers / huge page size +
-1 huge TLB pages. So for example for a system with 6GB shared buffers
-and a hugepage size of 2kb of you will need at least 3156 huge pages.
+xref linkend=linux-huge-tlb-pages.
/para
 
para
diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml
index bbb808f..5f9fa61 100644
--- a/doc/src/sgml/runtime.sgml
+++ b/doc/src/sgml/runtime.sgml
@@ -1307,6 +1307,57 @@ echo -1000  /proc/self/oom_score_adj
/para
/note
   /sect2
+
+  sect2 id=linux-huge-tlb-pages
+   titleLinux huge TLB pages/title
+
+   para
+Using huge TLB pages reduces overhead when using large contiguous chunks
+of memory, like PostgreSQL does. To enable this feature
+in productnamePostgreSQL/productname you need a kernel
+with varnameCONFIG_HUGETLBFS=y/varname and
+varnameCONFIG_HUGETLB_PAGE=y/varname. You also have to tune the system
+setting varnamevm.nr_hugepages/varname. To estimate the number of
+necessary huge pages start productnamePostgreSQL/productname without
+huge pages enabled and check the varnameVmPeak/varname value from the
+proc filesystem:
+programlisting
+$ userinputhead -1 /path/to/data/directory/postmaster.pid/userinput
+4170
+$ userinputgrep ^VmPeak /proc/4170/status/userinput
+VmPeak:  6490428 kB
+/programlisting
+ literal6490428/literal / literal2048/literal
+ (varnamePAGE_SIZE/varname is literal2MB/literal in this case) are
+ roughly literal3169.154/literal huge pages, so you will need at
+ least literal3170/literal huge pages:
+programlisting
+$ userinputsysctl -w vm.nr_hugepages=3170/userinput
+/programlisting
+Sometimes the kernel is not able to allocate the desired number of huge
+pages, so it might be necessary to repeat that command or to reboot. Don't
+forget to add an entry to filename/etc/sysctl.conf/filename to persist
+this setting through reboots.
+   /para
+
+   para
+The default behavior for huge pages in
+productnamePostgreSQL/productname is to use them when possible and
+to fallback to normal pages when failing. To enforce the use of huge
+pages, you can set
+link linkend=guc-huge-tlb-pagesvarnamehuge_tlb_pages/varname/link
+to literalon/literal. Note that in this case
+productnamePostgreSQL/productname will fail to start if not enough huge
+pages are available.
+   /para
+
+   para
+For a detailed description of the productnameLinux/productname huge
+pages feature have a look
+at ulink url=https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt;https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt/ulink.
+   /para
+
+  /sect2
  /sect1
 
 


pgpeMqXsSJV2_.pgp
Description: PGP signature


Re: [HACKERS] [PATCH] Use MAP_HUGETLB where supported (v3)

2014-02-26 Thread Christian Kruse
Hi,

On 26/02/14 13:13, Alvaro Herrera wrote:
 
 There's one thing that rubs me the wrong way about all this
 functionality, which is that we've named it huge TLB pages.  That is
 wrong -- the TLB pages are not huge.  In fact, as far as I understand,
 the TLB doesn't have pages at all.  It's the pages that are huge, but
 those pages are not TLB pages, they are just memory pages.

I didn't think about this, yet, but you are totally right.

 Since we haven't released any of this, should we discuss renaming it to
 just huge pages?

Attached is a patch with the updated documentation (now uses
consistently huge pages) as well as a renamed GUC, consistent wording
(always use huge pages) as well as renamed variables.

Should I create a new commit fest entry for this and delete the old
one? Or should this be done in two patches? Locally in my repo this is
done with two commits, so it would be easy to split that.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index cf11306..77c778f 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1166,35 +1166,33 @@ include 'filename'
   /listitem
  /varlistentry
 
- varlistentry id=guc-huge-tlb-pages xreflabel=huge_tlb_pages
-  termvarnamehuge_tlb_pages/varname (typeenum/type)/term
+ varlistentry id=guc-huge-pages xreflabel=huge_pages
+  termvarnamehuge_pages/varname (typeenum/type)/term
   indexterm
-   primaryvarnamehuge_tlb_pages/ configuration parameter/primary
+   primaryvarnamehuge_pages/ configuration parameter/primary
   /indexterm
   listitem
para
-Enables/disables the use of huge TLB pages. Valid values are
+Enables/disables the use of huge pages. Valid values are
 literaltry/literal (the default), literalon/literal,
 and literaloff/literal.
/para
 
para
-At present, this feature is supported only on Linux. The setting
-is ignored on other systems.
+At present, this feature is supported only on Linux. The setting is
+ignored on other systems when set to literaltry/literal.
+productnamePostgreSQL/productname will
+refuse to start when set to literalon/literal.
/para
 
para
-The use of huge TLB pages results in smaller page tables and
-less CPU time spent on memory management, increasing performance. For
-more details, see
-ulink url=https://wiki.debian.org/Hugepages;the Debian wiki/ulink.
-Remember that you will need at least shared_buffers / huge page size +
-1 huge TLB pages. So for example for a system with 6GB shared buffers
-and a hugepage size of 2kb of you will need at least 3156 huge pages.
+The use of huge pages results in smaller page tables and less CPU time
+spent on memory management, increasing performance. For more details,
+see xref linkend=linux-huge-pages.
/para
 
para
-With varnamehuge_tlb_pages/varname set to literaltry/literal,
+With varnamehuge_pages/varname set to literaltry/literal,
 the server will try to use huge pages, but fall back to using
 normal allocation if that fails. With literalon/literal, failure
 to use huge pages will prevent the server from starting up. With
diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml
index bbb808f..7f4a235 100644
--- a/doc/src/sgml/runtime.sgml
+++ b/doc/src/sgml/runtime.sgml
@@ -1307,6 +1307,57 @@ echo -1000  /proc/self/oom_score_adj
/para
/note
   /sect2
+
+  sect2 id=linux-huge-pages
+   titleLinux huge pages/title
+
+   para
+Using huge pages reduces overhead when using large contiguous chunks of
+memory, like productnamePostgreSQL/productname does. To enable this
+feature in productnamePostgreSQL/productname you need a kernel
+with varnameCONFIG_HUGETLBFS=y/varname and
+varnameCONFIG_HUGETLB_PAGE=y/varname. You also have to tune the system
+setting varnamevm.nr_hugepages/varname. To estimate the number of
+necessary huge pages start productnamePostgreSQL/productname without
+huge pages enabled and check the varnameVmPeak/varname value from the
+proc filesystem:
+programlisting
+$ userinputhead -1 /path/to/data/directory/postmaster.pid/userinput
+4170
+$ userinputgrep ^VmPeak /proc/4170/status/userinput
+VmPeak:  6490428 kB
+/programlisting
+ literal6490428/literal / literal2048/literal
+ (varnamePAGE_SIZE/varname is literal2MB/literal in this case) are
+ roughly literal3169.154/literal huge pages, so you will need at
+ least literal3170/literal huge pages:
+programlisting
+$ userinputsysctl -w vm.nr_hugepages=3170/userinput
+/programlisting
+Sometimes the kernel is not able to allocate the desired number of huge
+pages, so it might be necessary

Re: [HACKERS] [PATCH] Use MAP_HUGETLB where supported (v3)

2014-02-26 Thread Christian Kruse
Hi Peter,

thank you for your nice words, much appreciated. I'm sorry that I was
so whiny about this in the last post.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpxVj8SJRDQS.pgp
Description: PGP signature


Re: [HACKERS] [PATCH] Use MAP_HUGETLB where supported (v3)

2014-02-25 Thread Christian Kruse
Hi,

On 25/02/14 10:29, Peter Eisentraut wrote:
 I don't think we should be explaining the basics of OS memory management
 in our documentation.

Well, I'm confused. I thought that's exactly what has been asked.

 And if we did, we shouldn't copy it verbatim from the Debian wiki
 without attribution.

I didn't. This is a write-up of several articles, blog posts and
documentation I read about this topic.

However, if you think the texts are too similar, then we should add a
note, yes. Didn't mean to copy w/o referring to a source.

 I think this patch should be cut down to the paragraphs that cover the
 actual configuration.

I tried to cover the issues Heikki brought up in
52861eec.2090...@vmware.com.

 On a technical note, use xref instead of link for linking.
 doc/src/sgml/README.links contains some information.

OK, I will post an updated patch later this evening.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpfzdmiy8T7D.pgp
Description: PGP signature


Re: [HACKERS] [PATCH] Use MAP_HUGETLB where supported (v3)

2014-02-25 Thread Christian Kruse
Hi,

On 25/02/14 17:01, Andres Freund wrote:
  And if we did, we shouldn't copy it verbatim from the Debian wiki
  without attribution.
 
 Is it actually? A quick comparison doesn't show that many similarities?
 Christian?

Not as far as I know. But of course, as I wrote the text I _also_
(that's not my only source) read the Debian article and I was
influenced by it. It may be that the texts are more similar then I
thought, although I still don't see it.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgplDAVez7E7u.pgp
Description: PGP signature


Re: [HACKERS] Patch: show relation and tuple infos of a lock to acquire

2014-02-24 Thread Christian Kruse
Hi,

attached you will find a new version of the patch, removing the ctid
text but leaving the ctid itself in the message.

On 23/02/14 11:14, Amit Kapila wrote:
  In general, why I am suggesting to restrict display of newly added
  context for the case it is added to ensure that it doesn't get started
  displaying in other cases where it might make sense or might not
  make sense.
 
  Anything failing while inside an XactLockTableWait() et al. will benefit
  from the context. In which scenarios could that be unneccessary?
 
 This path is quite deep, so I have not verified that whether
 this new context can make sense for all error cases.
 For example, in below path (error message), it doesn't seem to
 make any sense (I have tried to generate it through debugger,
 actual message will vary).
 
 XactLockTableWait-SubTransGetParent-SimpleLruReadPage_ReadOnly-SimpleLruReadPage-SlruReportIOError
 
 ERROR:  could not access status of transaction 927
 DETAIL:  Could not open file pg_subtrans/: No error.
 CONTEXT:  relation public.t1
 tuple ctid (0,2)

To be honest, I don't like the idea of setting up this error context
only for log_lock_wait messages. This sounds unnecessary complex to me
and I think that in the few cases where this message doesn't add a
value (and thus is useless) don't justify such complexity.

 I have not checked that, but the reason I mentioned about database name
 is due to display of database oid in similar message, please see the
 message below. I think in below context we get it from lock tag and
 I think for the patch case also, it might be better to display, but not
 a mandatory thing. Consider it as a suggestion which if you also feel
 good, then do it, else ignore it.
 
 LOG:  process 4656 still waiting for ExclusiveLock on tuple (0,1) of relation 
 57
 513 of database 12045 after 1046.000 ms

After thinking a bit about this I added the database name to the
context message, see attached patch.

   Currently I simply display the whole tuple with truncating long
   fields. This is plain easy, no distinction necessary. I did some code
   reading and it seems somewhat complex to get the PK columns and it
   seems that we need another lock for this, too - maybe not the best
   idea when we are already in locking trouble, do you disagree?
 
  I don't think you need to take another lock for this, it must already
  have appropriate lock by that time. There should not be any problem
  in calling relationHasPrimaryKey except that currently it is static, you
  need to expose it.
 
  Other locations that deal with similar things (notably
  ExecBuildSlotValueDescription()) doesn't either. I don't think this
  patch should introduce it then.
 
 Displaying whole tuple doesn't seem to be the most right way
 to get debug information and especially in this case we are
 already displaying tuple offset(ctid) which is unique identity
 to identify a tuple. It seems to me that it is sufficient to display
 unique value of tuple if present.
 I understand that there is no clear issue here, so may be if others also
 share their opinion then it will be quite easy to take a call.

That would be nice. I didn't change it, yet.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index a771ccb..a04414e 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -107,6 +107,8 @@ static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
 		uint16 t_infomask);
 static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
 int *remaining, uint16 infomask);
+static void MultiXactIdWaitWithErrorContext(Relation rel, HeapTuple tup, MultiXactId multi,
+	MultiXactStatus status, int *remaining, uint16 infomask);
 static bool ConditionalMultiXactIdWait(MultiXactId multi,
 		   MultiXactStatus status, int *remaining,
 		   uint16 infomask);
@@ -2703,8 +2705,8 @@ l1:
 		if (infomask  HEAP_XMAX_IS_MULTI)
 		{
 			/* wait for multixact */
-			MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate,
-			NULL, infomask);
+			MultiXactIdWaitWithErrorContext(relation, tp, (MultiXactId) xwait,
+	  MultiXactStatusUpdate, NULL, infomask);
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
@@ -2730,7 +2732,7 @@ l1:
 		else
 		{
 			/* wait for regular transaction to end */
-			XactLockTableWait(xwait);
+			XactLockTableWaitWithInfo(relation, tp, xwait);
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
@@ -3255,8 +3257,8 @@ l2:
 			int			remain;
 
 			/* wait for multixact */
-			MultiXactIdWait((MultiXactId) xwait, mxact_status, remain,
-			infomask);
+			MultiXactIdWaitWithErrorContext(relation, oldtup,
+	   (MultiXactId) xwait, mxact_status, remain, infomask);
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
@@ -3330,7 +3332,7 @@ l2:
 			else
 			{
 /* wait

Re: [HACKERS] Patch: show relation and tuple infos of a lock to acquire

2014-02-21 Thread Christian Kruse
Hi,

On 19.02.2014 08:11, Amit Kapila wrote:
 I have looked into this patch. Below are some points which I
 think should be improved in this patch:

Thank you for your review.

 1. New context added by this patch is display at wrong place
 […]
 Do this patch expect to print the context at cancel request
 as well?
 I don't find it useful. I think the reason is that after setup of
 context, if any other error happens, it will display the newly setup
 context.

To be honest, I don't see a problem here. If you cancel the request in
most cases it is because it doesn't finish in an acceptable time. So
displaying the context seems logical to me.

 2a. About the message:
 LOG:  process 1936 still waiting for ShareLock on transaction 842
 after 1014.000ms
 CONTEXT:  relation name: foo (OID 57496)
 tuple (ctid (0,1)): (1, 2, abc  )
 
 Isn't it better to display database name as well, as if we see in
 one of related messages as below, it displays database name as well.
 
 LOG:  process 3012 still waiting for ExclusiveLock on tuple (0,1) of
 relation 57
 499 of database 12045 after 1014.000 ms

Maybe. But then we either have duplicate infos in some cases (e.g. on
a table lock) or we have to distinguish error cases and show distinct
contextes. And also getting the database name from a relation is not
really straight forward (according to Andres we would have to look at
rel-rd_node.dbNode) and since other commands dealing with names don't
do so, either, I think we should leave out the database name.

 2b. I think there is no need to display *ctid* before tuple offset, it seems
 to be obvious as well as consistent with other similar message.

Ok, I'll drop it.

 3. About callback I think rather than calling setup/tear down
 functions from multiple places, it will be better to have wrapper
 functions for XactTableLockWait() and MultiXactIdWait(). Just check
 in plpgsql_exec_function(), how the errorcallback is set, can we do
 something similar without to avoid palloc?

OK, Attached.

 4. About the values of tuple
 I think it might be useful to display some unique part of tuple for
 debugging part, but what will we do incase there is no primary
 key on table, so may be we can display initial few columns (2 or 3)
 or just display tuple offset as is done in other similar message
 shown above.

Currently I simply display the whole tuple with truncating long
fields. This is plain easy, no distinction necessary. I did some code
reading and it seems somewhat complex to get the PK columns and it
seems that we need another lock for this, too – maybe not the best
idea when we are already in locking trouble, do you disagree?

Also showing just a few columns when no PK is available might not be
helpful since the tuple is not necessarily identified by the columns
showed. And since we drop the CTID there is no alternative solution to
identify the tuple.

IMHO simply displaying the whole tuple is the best solution in this
case, do you agree?

 More to the point, I have observed that in few cases
 displaying values of tuple can be confusing as well. For Example:
 […]
 Now here it will be bit confusing to display values with tuple, as
 in session-3 statement has asked to update value (3), but we have
 started waiting for value (4). Although it is right, but might not
 make much sense.

What do you suggest to avoid confusion? I can see what you are talking
about, but I'm not sure what to do about it. Keep in mind that this
patch should help debugging locking, so IMHO it is essential to be
able to identify a tuple and therefore knowing its values.

 Also after session-2 commits the transaction, session-3 will say
 acquired lock, however still it will not be able to update it.

To be honest, I don't think that this is a problem of the
patch. Concurrency is not easy and it might lead to confusing
situations. I don't think that we should drop the tuple values because
of this, since they provide useful and precious debugging information.

Attached you will find a new version of the patch, mainly using
wrapper functions for XactLockTableWait() and MultiXactIdWait().

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index a771ccb..71aaa93 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -107,6 +107,8 @@ static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
 		uint16 t_infomask);
 static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
 int *remaining, uint16 infomask);
+static void MultiXactIdWaitWithErrorContext(Relation rel, HeapTuple tup, MultiXactId multi,
+	MultiXactStatus status, int *remaining, uint16 infomask);
 static bool ConditionalMultiXactIdWait(MultiXactId multi,
 		   MultiXactStatus status, int *remaining,
 		   uint16 infomask);
@@ -2703,8 +2705,8 @@ l1:
 		if (infomask

Re: [HACKERS] Storing the password in .pgpass file in an encrypted format

2014-02-21 Thread Christian Kruse
Hi,

On 21/02/14 11:15, Alvaro Herrera wrote:
 Maybe you can memfrob() the password to encrypt it before writing, and
 then memfrob() it back before applying it.  Would that be secure?

From `man memfrob`:

 Note that this function is not a proper encryption routine as the XOR
 constant is fixed, and is only suitable for hiding strings.

No, it is not secure. And I agree, encrypting .pgpass doesn't make
sense. Either you have a known key and then encryption is useless or
you have to provide a key at runtime and then .pgpass is useless.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpVNLWTO24xl.pgp
Description: PGP signature


Re: [HACKERS] Patch: show xid and xmin in pg_stat_activity and pg_stat_replication

2014-02-20 Thread Christian Kruse
Hi,

On 18.02.2014 22:02, Andres Freund wrote:
 Not really sure which way is better.

One dev against it, one dev not sure. Enough for me to change it :)

Will post a new patch this evening.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services




signature.asc
Description: OpenPGP digital signature


Re: [HACKERS] Patch: show xid and xmin in pg_stat_activity and pg_stat_replication

2014-02-17 Thread Christian Kruse
Hi Robert,

Am 15.02.14 05:03, schrieb Robert Haas:
 Well, this version of the patch reveals a mighty interesting point: a
 lot of the people who are calling pgstat_fetch_stat_beentry() don't
 need this additional information and might prefer not to pay the cost
 of fetching it.

Well, the cost is already paid due to the fact that this patch uses
LocalPgBackendStatus instead of PgBackendStatus in
pgstat_read_current_status(). And pgstat_fetch_stat_beentry() returns
a pointer instead of a copy, so the cost is rather small, too.

 None of pg_stat_get_backend_pid,
 pg_stat_get_backend_dbid, pg_stat_get_backend_userid,
 pg_stat_get_backend_activity, pg_stat_get_backend_activity,
 pg_stat_get_backend_waiting, pg_stat_get_backend_activity_start,
 pg_stat_get_backend_xact_start, pg_stat_get_backend_start,
 pg_stat_get_backend_client_addr, pg_stat_get_backend_client_port,
 pg_stat_get_backend_client_port, and pg_stat_get_db_numbackends
 actually need this new information; it's only ever used in one place.
 So it seems like it might be wise to have pgstat_fetch_stat_beentry
 continue to return the PgBackendStatus * and add a new function
 pgstat_fetch_stat_local_beentry to fetch the LocalPgBackendStatus *;
 then most of these call sites wouldn't need to change.

This is true for now. But one of the purposes of using
LocalPgBackendStatus instead of PgBackendStatus was to be able to add
more fields like this in future. And thus we might need to change this
in future, so why not do it now?

And I also agree to Andres.

 It would still be the case that pgstat_read_current_status() pays the
 price of fetching this information even if pg_stat_get_activity is
 never called.  But since that's probably by far the most commonly-used
 API for this information, that's probably OK.

I agree.

I will change it if this is really wanted, but I think it would be a
good idea to do it this way.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services




signature.asc
Description: OpenPGP digital signature


Re: [HACKERS] Patch: compiling the docs under Gentoo

2014-02-12 Thread Christian Kruse
Hi,

On Tuesday 11 February 2014 16:04:30 Peter Eisentraut wrote:
 On 1/30/14, 2:42 AM, Christian Kruse wrote:
  +Since Gentoo often supports different versions of a package to be
  +installed you have to tell the PostgreSQL build environment where the
  +Docbook DTD is located:
  +programlisting
  +cd /path/to/postgresql/sources/doc
  +make DOCBOOKSTYLE=/usr/share/sgml/docbook/sgml-dtd-4.2
  +/programlisting
 
 This is wrong.

To be honest I noticed a few days ago that this is unnecessary. Just
installing the right packages already solved the problem, it was a
fallacy that setting DOCBOOKSTYLE did help.

I just didn't have had the time to send a new version of the patch,
yet…

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



signature.asc
Description: This is a digitally signed message part.


Re: [HACKERS] extension_control_path

2014-02-07 Thread Christian Kruse
Hi,

On 06/02/14 18:14, Greg Stark wrote:
 Installing into /usr/local is a global system change. Only root should
 be able to do that and any user that can do that can easily acquire
 root privileges.

The idea behind Homebrew is copied from FreeBSD, where you also
install 3rd party software to /usr/local. This is felt as cleaner and
nicer by these guys. Homebrew goes one step further: with Homebrew you
are able to completely remove all 3rd party software installed via
Homebrew as well as Homebrew itself by simply removing /usr/local.

And since most of the time OS X is used as a desktop software, they
simplified things for users by chown-ing /usr/local (which, in a clean
OS X installation, is either empty or does not exist, depending on the
version) at installation time to the user installing Homebrew.

Of course you can avoid this by installing Homebrew as root, but using
the root user is not very popular in OS X land.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpZntl0SyG93.pgp
Description: PGP signature


Re: [HACKERS] Patch: Show process IDs of processes holding a lock; show relation and tuple infos of a lock to acquire

2014-02-04 Thread Christian Kruse
Hi,

On 04/02/14 12:38, Fujii Masao wrote:
 ISTM that the phrase Request queue is not used much around the lock.
 Using the phrase wait queue or Simon's suggestion sound better to at least 
 me.
 Thought?

Sounds reasonable to me. Attached patch changes messages to the following:

Process holding the lock: A. Wait queue: B.
Processes holding the lock: A, B. Wait queue: C.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/doc/src/sgml/sources.sgml b/doc/src/sgml/sources.sgml
index 881b0c3..aa20807 100644
--- a/doc/src/sgml/sources.sgml
+++ b/doc/src/sgml/sources.sgml
@@ -251,6 +251,15 @@ ereport(ERROR,
/listitem
listitem
 para
+ functionerrdetail_log_plural(const char *fmt_singuar, const char
+ *fmt_plural, unsigned long n, ...)/function is like
+ functionerrdetail_log/, but with support for various plural forms of
+ the message.
+ For more information see xref linkend=nls-guidelines.
+/para
+   /listitem
+   listitem
+para
  functionerrhint(const char *msg, ...)/function supplies an optional
  quotehint/ message; this is to be used when offering suggestions
  about how to fix the problem, as opposed to factual details about
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index fb449a8..9620f6e 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -1209,13 +1209,23 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 		 */
 		if (log_lock_waits  deadlock_state != DS_NOT_YET_CHECKED)
 		{
-			StringInfoData buf;
+			StringInfoData buf,
+		lock_waiters_sbuf,
+		lock_holders_sbuf;
 			const char *modename;
 			long		secs;
 			int			usecs;
 			long		msecs;
+			SHM_QUEUE  *procLocks;
+			PROCLOCK   *proclock;
+			bool		first_holder = true,
+		first_waiter = true;
+			int			lockHoldersNum = 0;
 
 			initStringInfo(buf);
+			initStringInfo(lock_waiters_sbuf);
+			initStringInfo(lock_holders_sbuf);
+
 			DescribeLockTag(buf, locallock-tag.lock);
 			modename = GetLockmodeName(locallock-tag.lock.locktag_lockmethodid,
 	   lockmode);
@@ -1225,10 +1235,67 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 			msecs = secs * 1000 + usecs / 1000;
 			usecs = usecs % 1000;
 
+			/*
+			 * we loop over the lock's procLocks to gather a list of all
+			 * holders and waiters. Thus we will be able to provide more
+			 * detailed information for lock debugging purposes.
+			 *
+			 * lock-procLocks contains all processes which hold or wait for
+			 * this lock.
+			 */
+
+			LWLockAcquire(partitionLock, LW_SHARED);
+
+			procLocks = (lock-procLocks);
+			proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+			   offsetof(PROCLOCK, lockLink));
+
+			while (proclock)
+			{
+/*
+ * we are a waiter if myProc-waitProcLock == proclock; we are
+ * a holder if it is NULL or something different
+ */
+if (proclock-tag.myProc-waitProcLock == proclock)
+{
+	if (first_waiter)
+	{
+		appendStringInfo(lock_waiters_sbuf, %d,
+		 proclock-tag.myProc-pid);
+		first_waiter = false;
+	}
+	else
+		appendStringInfo(lock_waiters_sbuf, , %d,
+		 proclock-tag.myProc-pid);
+}
+else
+{
+	if (first_holder)
+	{
+		appendStringInfo(lock_holders_sbuf, %d,
+		 proclock-tag.myProc-pid);
+		first_holder = false;
+	}
+	else
+		appendStringInfo(lock_holders_sbuf, , %d,
+		 proclock-tag.myProc-pid);
+
+	lockHoldersNum++;
+}
+
+proclock = (PROCLOCK *) SHMQueueNext(procLocks, proclock-lockLink,
+			   offsetof(PROCLOCK, lockLink));
+			}
+
+			LWLockRelease(partitionLock);
+
 			if (deadlock_state == DS_SOFT_DEADLOCK)
 ereport(LOG,
 		(errmsg(process %d avoided deadlock for %s on %s by rearranging queue order after %ld.%03d ms,
-			  MyProcPid, modename, buf.data, msecs, usecs)));
+MyProcPid, modename, buf.data, msecs, usecs),
+		 (errdetail_log_plural(Process holding the lock: %s. Wait queue: %s.,
+		   Processes holding the lock: %s. Wait queue: %s.,
+			   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data;
 			else if (deadlock_state == DS_HARD_DEADLOCK)
 			{
 /*
@@ -1240,13 +1307,19 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
  */
 ereport(LOG,
 		(errmsg(process %d detected deadlock while waiting for %s on %s after %ld.%03d ms,
-			  MyProcPid, modename, buf.data, msecs, usecs)));
+MyProcPid, modename, buf.data, msecs, usecs),
+		 (errdetail_log_plural(Process holding the lock: %s. Wait queue: %s.,
+		   Processes holding the lock: %s. Wait queue: %s.,
+			   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data;
 			}
 
 			if (myWaitStatus == STATUS_WAITING)
 ereport(LOG,
 		(errmsg(process %d still waiting

Re: [HACKERS] nested hstore - large insert crashes server

2014-02-04 Thread Christian Kruse
Hi,

On 04/02/14 17:41, Erik Rijkers wrote:

 2014-02-04 10:34:25.376 CET 29133 LOG:  server process (PID 29459) was 
 terminated by signal 9: Killed

Did you check if this was the OOM killer? Should be logged in dmesg.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpfuIWxnh8kv.pgp
Description: PGP signature


Re: [HACKERS] Wait free LW_SHARED acquisition

2014-02-04 Thread Christian Kruse
Hi,

I'm doing some benchmarks regarding this problem: one set with
baseline and one set with your patch. Machine was a 32 core machine (4
CPUs with 8 cores), 252 gib RAM. Both versions have the type align
patch applied. pgbench-tools config:

SCALES=100
SETCLIENTS=1 4 8 16 32 48 64 96 128
SETTIMES=2

I added -M prepared to the pgbench call in the benchwarmer script.

The read-only tests are finished, I come to similiar results as yours:

http://wwwtech.de/pg/benchmarks-lwlock-read-only/

I think the small differences are caused by the fact that I use TCP
connections and not Unix domain sockets.

The results are pretty impressive… I will post the read-write results
as soon as they are finished.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpKL7vRo_Vp0.pgp
Description: PGP signature


Re: [HACKERS] Wait free LW_SHARED acquisition

2014-02-04 Thread Christian Kruse
Hi,

On 04/02/14 12:02, Peter Geoghegan wrote:
 On Tue, Feb 4, 2014 at 11:39 AM, Christian Kruse
 christ...@2ndquadrant.com wrote:
  I'm doing some benchmarks regarding this problem: one set with
  baseline and one set with your patch. Machine was a 32 core machine (4
  CPUs with 8 cores), 252 gib RAM.
 
 What CPU model? Can you post /proc/cpuinfo? The distinction between
 logical and physical cores matters here.

model name  : Intel(R) Xeon(R) CPU E5-4620 0 @ 2.20GHz

32 physical cores, 64 logical cores. /proc/cpuinfo is applied.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpZpWnKfQtb4.pgp
Description: PGP signature


Re: [HACKERS] Wait free LW_SHARED acquisition

2014-02-04 Thread Christian Kruse
Hi,

On 04/02/14 21:03, Andres Freund wrote:
 Christian, could you rerun with master (the commit on which the
 branch is based on), the alignment patch, and then the lwlock patch?
 Best with max_connections 200.  That's probably more important than
 the write tests as a first step..

Ok, benchmark for baseline+alignment patch is running. This will take
a couple of hours and since I have to get up at about 05:00 I won't be
able to post it before tomorrow.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpsN5kcUiOgQ.pgp
Description: PGP signature


Re: [HACKERS] Patch: Show process IDs of processes holding a lock; show relation and tuple infos of a lock to acquire

2014-02-03 Thread Christian Kruse
Hi,

On 03/02/14 17:59, Fujii Masao wrote:
 Since you added errdetail_log_plural(), ISTM that you need to update
 sources.sgml.

[x] Fixed.

  While I'm griping, this message isn't even trying to follow the project's
  message style guidelines.  Detail or context messages are supposed to be
  complete sentence(s), with capitalization and punctuation to match.
 
  Hm, I hope I fixed it in this version of the patch.
 
 Current message doesn't look like complete sentence yet... We would
 need to use something like Processes X, Y are holding while Z is waiting
 for the lock.. I could not come up with good message, though..

The problem is that we have two potential plural cases in this
message. That leads to the need to formulate the second part
independently from singular/plural. I tried to improve a little bit
and propose this message:

Singular:
The following process is holding the lock: A. The request queue
consists of: B.

Plural:
Following processes are holding the lock: A, B. The request queue
consists of: C.

Attached you will find an updated patch.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/doc/src/sgml/sources.sgml b/doc/src/sgml/sources.sgml
index 881b0c3..aa20807 100644
--- a/doc/src/sgml/sources.sgml
+++ b/doc/src/sgml/sources.sgml
@@ -251,6 +251,15 @@ ereport(ERROR,
/listitem
listitem
 para
+ functionerrdetail_log_plural(const char *fmt_singuar, const char
+ *fmt_plural, unsigned long n, ...)/function is like
+ functionerrdetail_log/, but with support for various plural forms of
+ the message.
+ For more information see xref linkend=nls-guidelines.
+/para
+   /listitem
+   listitem
+para
  functionerrhint(const char *msg, ...)/function supplies an optional
  quotehint/ message; this is to be used when offering suggestions
  about how to fix the problem, as opposed to factual details about
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index fb449a8..da72c82 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -1209,13 +1209,23 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 		 */
 		if (log_lock_waits  deadlock_state != DS_NOT_YET_CHECKED)
 		{
-			StringInfoData buf;
+			StringInfoData buf,
+		lock_waiters_sbuf,
+		lock_holders_sbuf;
 			const char *modename;
 			long		secs;
 			int			usecs;
 			long		msecs;
+			SHM_QUEUE  *procLocks;
+			PROCLOCK   *proclock;
+			bool		first_holder = true,
+		first_waiter = true;
+			int			lockHoldersNum = 0;
 
 			initStringInfo(buf);
+			initStringInfo(lock_waiters_sbuf);
+			initStringInfo(lock_holders_sbuf);
+
 			DescribeLockTag(buf, locallock-tag.lock);
 			modename = GetLockmodeName(locallock-tag.lock.locktag_lockmethodid,
 	   lockmode);
@@ -1225,10 +1235,67 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 			msecs = secs * 1000 + usecs / 1000;
 			usecs = usecs % 1000;
 
+			/*
+			 * we loop over the lock's procLocks to gather a list of all
+			 * holders and waiters. Thus we will be able to provide more
+			 * detailed information for lock debugging purposes.
+			 *
+			 * lock-procLocks contains all processes which hold or wait for
+			 * this lock.
+			 */
+
+			LWLockAcquire(partitionLock, LW_SHARED);
+
+			procLocks = (lock-procLocks);
+			proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+			   offsetof(PROCLOCK, lockLink));
+
+			while (proclock)
+			{
+/*
+ * we are a waiter if myProc-waitProcLock == proclock; we are
+ * a holder if it is NULL or something different
+ */
+if (proclock-tag.myProc-waitProcLock == proclock)
+{
+	if (first_waiter)
+	{
+		appendStringInfo(lock_waiters_sbuf, %d,
+		 proclock-tag.myProc-pid);
+		first_waiter = false;
+	}
+	else
+		appendStringInfo(lock_waiters_sbuf, , %d,
+		 proclock-tag.myProc-pid);
+}
+else
+{
+	if (first_holder)
+	{
+		appendStringInfo(lock_holders_sbuf, %d,
+		 proclock-tag.myProc-pid);
+		first_holder = false;
+	}
+	else
+		appendStringInfo(lock_holders_sbuf, , %d,
+		 proclock-tag.myProc-pid);
+
+	lockHoldersNum++;
+}
+
+proclock = (PROCLOCK *) SHMQueueNext(procLocks, proclock-lockLink,
+			   offsetof(PROCLOCK, lockLink));
+			}
+
+			LWLockRelease(partitionLock);
+
 			if (deadlock_state == DS_SOFT_DEADLOCK)
 ereport(LOG,
 		(errmsg(process %d avoided deadlock for %s on %s by rearranging queue order after %ld.%03d ms,
-			  MyProcPid, modename, buf.data, msecs, usecs)));
+MyProcPid, modename, buf.data, msecs, usecs),
+		 (errdetail_log_plural(The following process is holding the lock: %s. The request queue consists of: %s.,
+			   Following processes are holding the lock: %s. The request queue consists of: %s

Re: [HACKERS] Patch: Show process IDs of processes holding a lock; show relation and tuple infos of a lock to acquire

2014-02-03 Thread Christian Kruse
Hi Simon,

On 03/02/14 10:43, Simon Riggs wrote:
  Singular:
  The following process is holding the lock: A. The request queue
  consists of: B.
 
  Plural:
  Following processes are holding the lock: A, B. The request queue
  consists of: C.
 
 Seems too complex. How about this...
 
 Lock holder(s): A. Lock waiter(s) B
 Lock holder(s): A, B. Lock waiter(s) C

This is basically the same as before, it is even shorter. The
complaint was that I don't use a whole sentence in this error
detail. Won't the change fulfill the same complaint?

To be honest, I'd like to stick with your earlier proposal:

Singular:
Process holding the lock: A. Request queue: B

Plural:
Processes holding the lock: A, B. Request queue: C, D

This seems to be a good trade-off between project guidelines,
readability and parsability.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpq4w0aipTXc.pgp
Description: PGP signature


Re: [HACKERS] Patch: Show process IDs of processes holding a lock; show relation and tuple infos of a lock to acquire

2014-02-01 Thread Christian Kruse
Hi,

On 01/02/14 02:45, Fujii Masao wrote:

 LOG:  process 33662 still waiting for ShareLock on transaction
 1011 after 1000.184 ms
 DETAIL:  Process holding the lock: 33660. Request queue: 33662.
 [… snip …]
 LOG:  process 33665 still waiting for ExclusiveLock on tuple (0,4)
 of relation 16384 of database 12310 after 1000.134 ms
 DETAIL:  Process holding the lock: 33662. Request queue: 33665
 
 This log message says that the process 33662 is holding the lock, but
 it's not true.

As the message says: first lock is waiting for the transaction, second
one for the tuple. So that are two different locks thus the two
different holders and queues. So…

 Is this the intentional behavior?

Yes, I think so.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpr14uoFS4_6.pgp
Description: PGP signature


Re: [HACKERS] [bug fix] pg_ctl fails with config-only directory

2014-02-01 Thread Christian Kruse
Hi,

On 31/01/14 22:17, MauMau wrote:
 Thanks for reviewing the patch.  Fixed.  I'll add this revised patch to the
 CommitFest entry soon.

Looks fine for me. Set it to „waiting for commit.“

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpLWrb04lc6J.pgp
Description: PGP signature


Re: [HACKERS] Patch: show xid and xmin in pg_stat_activity and pg_stat_replication

2014-02-01 Thread Christian Kruse
Hi,

On 31/01/14 11:24, Robert Haas wrote:
  what do you think about the approach the attached patch implements?
  I'm not really sure if this is what you had in mind, especially if
  this is the right lock.
 
 The attached patch seems not to be attached, […]

*sighs*
I'm at FOSDEM right now, I will send it as soon as I'm back home.

 […] but the right lock to use would be the same one
 BackendIdGetProc() is using.  I'd add a new function
 BackendIdGetTransactionIds or something like that.

Good – that's exactly what I did (with a slightly different naming).

  I also note that the docs seem to need some copy-editing:
 
  + entryThe current xref linked=ddl-system-columnsxmin
  value./xref/entry
 
 The link shouldn't include the period, and probably should also not
 include the word value.  I would make only the word xmin part of
 the link.

Thanks for elaboration.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpkstlLLohOH.pgp
Description: PGP signature


Re: [HACKERS] Patch: show xid and xmin in pg_stat_activity and pg_stat_replication

2014-01-31 Thread Christian Kruse
Hi,

 I suspect we should have a new accessor function that takes a backend
 ID and copies the xid and xmin to pointers provided by the client
 while holding the lock.

what do you think about the approach the attached patch implements?
I'm not really sure if this is what you had in mind, especially if
this is the right lock.

 I also note that the docs seem to need some copy-editing:
 
 + entryThe current xref linked=ddl-system-columnsxmin
 value./xref/entry

Can you elaborate?

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgp70zYmW2633.pgp
Description: PGP signature


Re: [HACKERS] [bug fix] pg_ctl fails with config-only directory

2014-01-30 Thread Christian Kruse
Hi,

personally I really dislike constructs like you used:

#ifdef WIN32
if (check_if_admin)
#endif
check_root(progname);

It is hard to read and may confuse editors. Can you rewrite it?

The rest looks fine to me.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgp56dqGkFa4q.pgp
Description: PGP signature


Re: [HACKERS] Regression tests failing if not launched on db regression

2014-01-30 Thread Christian Kruse
Hi,

 For the documentation patch, I propose the attached to avoid future
 confusions. Comments? It might make sense to back-patch as well.

Compiles, didn't find any typos and I think it is comprehensible.

Looks fine for me.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpCD1puVcmJQ.pgp
Description: PGP signature


Re: [HACKERS] Suspicion of a compiler bug in clang: using ternary operator in ereport()

2014-01-30 Thread Christian Kruse
Hi,

On 30/01/14 10:15, Andres Freund wrote:
  While I understand most modifications I'm a little bit confused by
  some parts. Have a look at for example this one:
  
  +   *errstr = psprintf(_(failed to look up effective user id %ld: %s),
  +  (long) user_id,
  +errno ? strerror(errno) : _(user does not 
  exist));
  
  Why is it safe here to use errno? It is possible that the _() function
  changes errno, isn't it?
 
 But the evaluation order is strictly defined here, no? First the boolean
 check for errno, then *either* strerror(errno), *or* the _().

Have a look at the psprintf() call: we first have a _(failed to look
up effective user id %ld: %s) as an argument, then we have a (long)
user_id and after that we have a ternary expression using errno. Isn't
it possible that the first _() changes errno?

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpYj53H5GFar.pgp
Description: PGP signature


Re: [HACKERS] Suspicion of a compiler bug in clang: using ternary operator in ereport()

2014-01-30 Thread Christian Kruse
Hi,

On 30/01/14 10:01, Tom Lane wrote:
 While I haven't actually read the gettext docs, I'm pretty sure that
 gettext() is defined to preserve errno.  It's supposed to be something
 that you can drop into existing printf's without thinking, and if
 it mangled errno that would certainly not be the case.

Thanks for your explanation. I verified reading the man page and it
explicitly says:

ERRORS
   errno is not modified.


Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpHCNemua8zx.pgp
Description: PGP signature


Re: [HACKERS] Suspicion of a compiler bug in clang: using ternary operator in ereport()

2014-01-29 Thread Christian Kruse
Hi,

On 28/01/14 22:35, Tom Lane wrote:
  Absolutely.  Probably best to save errno into a local just before the
  ereport.
 
  I think just resetting to edata-saved_errno is better and sufficient?
 
 -1 --- that field is nobody's business except elog.c's.

Ok, so I propose the attached patch as a fix.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c
index 8705586..f40215a 100644
--- a/src/backend/utils/error/elog.c
+++ b/src/backend/utils/error/elog.c
@@ -715,6 +715,7 @@ errcode_for_socket_access(void)
 	{ \
 		char		   *fmtbuf; \
 		StringInfoData	buf; \
+		int saved_errno = errno; \
 		/* Internationalize the error format string */ \
 		if (translateit  !in_error_recursion_trouble()) \
 			fmt = dgettext((domain), fmt);  \
@@ -744,6 +745,7 @@ errcode_for_socket_access(void)
 			pfree(edata-targetfield); \
 		edata-targetfield = pstrdup(buf.data); \
 		pfree(buf.data); \
+		errno = saved_errno; \
 	}
 
 /*
@@ -756,6 +758,7 @@ errcode_for_socket_access(void)
 		const char	   *fmt; \
 		char		   *fmtbuf; \
 		StringInfoData	buf; \
+		int saved_errno = errno; \
 		/* Internationalize the error format string */ \
 		if (!in_error_recursion_trouble()) \
 			fmt = dngettext((domain), fmt_singular, fmt_plural, n); \
@@ -787,6 +790,7 @@ errcode_for_socket_access(void)
 			pfree(edata-targetfield); \
 		edata-targetfield = pstrdup(buf.data); \
 		pfree(buf.data); \
+		errno = saved_errno; \
 	}
 
 


pgpnd3GqyaztU.pgp
Description: PGP signature


Re: [HACKERS] [PATCH] Use MAP_HUGETLB where supported (v3)

2014-01-29 Thread Christian Kruse
Hi,

On 29/01/14 14:12, Heikki Linnakangas wrote:
 The documentation is still lacking. We should explain somewhere how to set
 nr.hugepages, for example. The Managing Kernel Resources section ought to
 mention setting. Could I ask you to work on that, please?

Of course! Attached you will find a patch for better documentation.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
old mode 100644
new mode 100755
index 1b5f831..68b38f7
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1128,10 +1128,7 @@ include 'filename'
 The use of huge TLB pages results in smaller page tables and
 less CPU time spent on memory management, increasing performance. For
 more details, see
-ulink url=https://wiki.debian.org/Hugepages;the Debian wiki/ulink.
-Remember that you will need at least shared_buffers / huge page size +
-1 huge TLB pages. So for example for a system with 6GB shared buffers
-and a hugepage size of 2kb of you will need at least 3156 huge pages.
+link linkend=linux-huge-tlb-pagesLinux huge TLB pages/link.
/para
 
para
diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml
old mode 100644
new mode 100755
index bbb808f..2288c7b
--- a/doc/src/sgml/runtime.sgml
+++ b/doc/src/sgml/runtime.sgml
@@ -1307,6 +1307,83 @@ echo -1000  /proc/self/oom_score_adj
/para
/note
   /sect2
+
+  sect2 id=linux-huge-tlb-pages
+   titleLinux huge TLB pages/title
+
+   para
+Nowadays memory address spaces for processes are virtual. This means, when
+a process reserves memory, it gets a virtual memory address which has to
+be translated to a physical memory address by the OS or the CPU. This can
+be done via calculations, but since memory is accessed very often there is
+a cache for that, called Translation Lookaside Buffer,
+short emphasisTLB/emphasis.
+   /para
+
+   para
+When a process reserves memory, this is done in chunks (often
+of literal4kb/literal) named pages. This means if a process requires
+1GB of RAM, it has literal262144/literal (literal1GB/literal
+/ literal4KB/literal) pages and therefore literal262144/literal
+entries for the translation table. Since the TLB has a limited number of
+entries it is obvious that they can't be they can't all be cached, which
+will lead to loss of performance.
+   /para
+
+   para
+One way to tune this is to increase the page size. Most platforms allow
+larger pages, e.g. x86 allows pages of literal2MB/literal. This would
+reduce the number of pages to literal512/literal
+(literal1GB/literal / literal2MB/literal). This reduces the number
+of necessary lookups drastrically.
+   /para
+
+   para
+To enable this feature in productnamePostgreSQL/productname you need a
+kernel with varnameCONFIG_HUGETLBFS=y/varname and
+varnameCONFIG_HUGETLB_PAGE=y/varname. You also have to tune the system
+setting varnamevm.nr_hugepages/varname. To calculate the number of
+necessary huge pages start productnamePostgreSQL/productname without
+huge pages enabled and check the varnameVmPeakvarname value from the
+proc filesystem:
+
+programlisting
+$ userinputhead -1 /path/to/data/directory/postmaster.pid/userinput
+4170
+$ userinputgrep ^VmPeak /proc/4170/status/userinput
+VmPeak:	 6490428 kB
+/programlisting
+ literal6490428/literal / literal2048/literal
+ (varnamePAGE_SIZE/varname literal2MB/literal) are
+ roughly literal3169.154/literal huge pages, so you will need at
+ least literal3170/literal huge pages:
+programlisting
+sysctl -w vm.nr_hugepages=3170
+/programlisting
+Sometimes the kernel is not able to allocate the desired number of huge
+pages, so it might be necessary to repeat that command or to reboot. Don't
+forget to add an entry to filename/etc/sysctl.conf/filename to persist
+this setting through reboots.
+   /para
+
+   para
+The default behavior for huge pages
+in productnamePostgreSQL/productname is to use them when possible and
+to fallback to normal pages when failing. To enforce the use of huge
+pages, you can
+set link linkend=guc-huge-tlb-pagesvarnamehuge_tlb_pages/varname/link
+to literalon/literal. Note that in this
+case productnamePostgreSQL/productname will fail to start if not
+enough huge pages are available.
+   /para
+
+   para
+For a detailed description of the productnameLinux/productname huge
+pages feature have a look
+at ulink url=https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt/ulink.
+   /para
+
+  /sect2
  /sect1
 
 


pgp2cXAeJk4PE.pgp
Description: PGP signature


Re: [HACKERS] [PATCH] Use MAP_HUGETLB where supported (v3)

2014-01-29 Thread Christian Kruse
Hi,

On 29/01/14 10:11, Jeff Janes wrote:
 I'm getting this warning now with gcc (GCC) 4.4.7:

Interesting. I don't get that warning. But the compiler is (formally)
right.

 pg_shmem.c: In function 'PGSharedMemoryCreate':
 pg_shmem.c:332: warning: 'allocsize' may be used uninitialized in this
 function
 pg_shmem.c:332: note: 'allocsize' was declared here

Attached patch should fix that.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index f7596bf..c39dfb6 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -329,7 +329,7 @@ PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
 static void *
 CreateAnonymousSegment(Size *size)
 {
-	Size		allocsize;
+	Size		allocsize = *size;
 	void	   *ptr = MAP_FAILED;
 
 #ifndef MAP_HUGETLB
@@ -358,7 +358,6 @@ CreateAnonymousSegment(Size *size)
 		 */
 		int			hugepagesize = 2 * 1024 * 1024;
 
-		allocsize = *size;
 		if (allocsize % hugepagesize != 0)
 			allocsize += hugepagesize - (allocsize % hugepagesize);
 
@@ -372,7 +371,6 @@ CreateAnonymousSegment(Size *size)
 	if (huge_tlb_pages == HUGE_TLB_OFF ||
 		(huge_tlb_pages == HUGE_TLB_TRY  ptr == MAP_FAILED))
 	{
-		allocsize = *size;
 		ptr = mmap(NULL, *size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS, -1, 0);
 	}
 


pgpzNJrOrDmZa.pgp
Description: PGP signature


Re: [HACKERS] [PATCH] Use MAP_HUGETLB where supported (v3)

2014-01-29 Thread Christian Kruse
Hi,

On 29/01/14 21:36, Heikki Linnakangas wrote:
 […]
 Fix pushed.

You are right. Thanks. But there is another bug, see

20140128154307.gc24...@defunct.ch

ff. Attached you will find a patch fixing that.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index ac3a9fe..cf590a0 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -380,9 +380,12 @@ CreateAnonymousSegment(Size *size)
 	}
 
 	if (ptr == MAP_FAILED)
+	{
+		int			saved_errno = errno;
+
 		ereport(FATAL,
 (errmsg(could not map anonymous shared memory: %m),
- (errno == ENOMEM) ?
+ (saved_errno == ENOMEM) ?
  errhint(This error usually means that PostgreSQL's request 
 	for a shared memory segment exceeded available memory, 
 	  swap space or huge pages. To reduce the request size 
@@ -390,6 +393,7 @@ CreateAnonymousSegment(Size *size)
 	   memory usage, perhaps by reducing shared_buffers or 
 		 max_connections.,
 		 *size) : 0));
+	}
 
 	*size = allocsize;
 	return ptr;


pgphzz7yu9Gp0.pgp
Description: PGP signature


Re: [HACKERS] [PATCH] Use MAP_HUGETLB where supported (v3)

2014-01-29 Thread Christian Kruse
Hi,

On 29/01/14 22:17, Heikki Linnakangas wrote:
 Thanks. There are more cases of that in InternalIpcMemoryCreate, they ought
 to be fixed as well. And should also grep the rest of the codebase for more
 instances of that. And this needs to be back-patched.

I'm way ahead of you ;-) Working on it.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpCS2lzolGJg.pgp
Description: PGP signature


Re: [HACKERS] Suspicion of a compiler bug in clang: using ternary operator in ereport()

2014-01-29 Thread Christian Kruse
Hi,

On 29/01/14 13:39, Tom Lane wrote:
 No, what I meant is that the ereport caller needs to save errno, rather
 than assuming that (some subset of) ereport-related subroutines will
 preserve it.
 […]

Your reasoning sounds quite logical to me. Thus I did a

grep -RA 3 ereport src/* | less

and looked for ereport calls with errno in it. I found quite a few,
attached you will find a patch addressing that issue.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c
index d73e5e8..3705d0b 100644
--- a/src/backend/commands/tablespace.c
+++ b/src/backend/commands/tablespace.c
@@ -782,10 +782,14 @@ remove_symlink:
 	else
 	{
 		if (unlink(linkloc)  0)
-			ereport(redo ? LOG : (errno == ENOENT ? WARNING : ERROR),
+		{
+			int saved_errno = errno;
+
+			ereport(redo ? LOG : (saved_errno == ENOENT ? WARNING : ERROR),
 	(errcode_for_file_access(),
 	 errmsg(could not remove symbolic link \%s\: %m,
 			linkloc)));
+		}
 	}
 
 	pfree(linkloc_with_version_dir);
diff --git a/src/backend/port/sysv_sema.c b/src/backend/port/sysv_sema.c
index b4825d2..c79c8ad 100644
--- a/src/backend/port/sysv_sema.c
+++ b/src/backend/port/sysv_sema.c
@@ -85,7 +85,8 @@ static void ReleaseSemaphores(int status, Datum arg);
 static IpcSemaphoreId
 InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey, int numSems)
 {
-	int			semId;
+	int			semId,
+saved_errno;
 
 	semId = semget(semKey, numSems, IPC_CREAT | IPC_EXCL | IPCProtection);
 
@@ -107,12 +108,13 @@ InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey, int numSems)
 		/*
 		 * Else complain and abort
 		 */
+		saved_errno = errno;
 		ereport(FATAL,
 (errmsg(could not create semaphores: %m),
  errdetail(Failed system call was semget(%lu, %d, 0%o).,
 		   (unsigned long) semKey, numSems,
 		   IPC_CREAT | IPC_EXCL | IPCProtection),
- (errno == ENOSPC) ?
+ (saved_errno == ENOSPC) ?
  errhint(This error does *not* mean that you have run out of disk space.  
 		  It occurs when either the system limit for the maximum number of 
 			 semaphore sets (SEMMNI), or the system wide maximum number of 
@@ -133,13 +135,14 @@ static void
 IpcSemaphoreInitialize(IpcSemaphoreId semId, int semNum, int value)
 {
 	union semun semun;
+	int			saved_errno = errno;
 
 	semun.val = value;
 	if (semctl(semId, semNum, SETVAL, semun)  0)
 		ereport(FATAL,
 (errmsg_internal(semctl(%d, %d, SETVAL, %d) failed: %m,
  semId, semNum, value),
- (errno == ERANGE) ?
+ (saved_errno == ERANGE) ?
  errhint(You possibly need to raise your kernel's SEMVMX value to be at least 
   %d.  Look into the PostgreSQL documentation for details.,
 		 value) : 0));
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index ac3a9fe..cb297bb 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -68,6 +68,7 @@ InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size)
 {
 	IpcMemoryId shmid;
 	void	   *memAddress;
+	int			saved_errno = 0;
 
 	shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection);
 
@@ -137,25 +138,26 @@ InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size)
 		 * it should be.  SHMMNI violation is ENOSPC, per spec.  Just plain
 		 * not-enough-RAM is ENOMEM.
 		 */
+		saved_errno = errno;
 		ereport(FATAL,
 (errmsg(could not create shared memory segment: %m),
 		  errdetail(Failed system call was shmget(key=%lu, size=%zu, 0%o).,
 	(unsigned long) memKey, size,
 	IPC_CREAT | IPC_EXCL | IPCProtection),
- (errno == EINVAL) ?
+ (saved_errno == EINVAL) ?
  errhint(This error usually means that PostgreSQL's request for a shared memory 
 		 segment exceeded your kernel's SHMMAX parameter, or possibly that 
 		 it is less than 
 		 your kernel's SHMMIN parameter.\n
 		The PostgreSQL documentation contains more information about shared 
 		 memory configuration.) : 0,
- (errno == ENOMEM) ?
+ (saved_errno == ENOMEM) ?
  errhint(This error usually means that PostgreSQL's request for a shared 
 		 memory segment exceeded your kernel's SHMALL parameter.  You might need 
 		 to reconfigure the kernel with larger SHMALL.\n
 		The PostgreSQL documentation contains more information about shared 
 		 memory configuration.) : 0,
- (errno == ENOSPC) ?
+ (saved_errno == ENOSPC) ?
  errhint(This error does *not* mean that you have run out of disk space.  
 		 It occurs either if all available shared memory IDs have been taken, 
 		 in which case you need to raise the SHMMNI parameter in your kernel, 


pgpkJvJgiH340.pgp
Description: PGP signature


Re: [HACKERS] Suspicion of a compiler bug in clang: using ternary operator in ereport()

2014-01-29 Thread Christian Kruse
Hi,

On 29/01/14 21:37, Christian Kruse wrote:
 […]
 attached you will find a patch addressing that issue.

Maybe we should include the patch proposed in

20140129195930.gd31...@defunct.ch

and do this as one (slightly bigger) patch. Attached you will find
this alternative version.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c
index d73e5e8..3705d0b 100644
--- a/src/backend/commands/tablespace.c
+++ b/src/backend/commands/tablespace.c
@@ -782,10 +782,14 @@ remove_symlink:
 	else
 	{
 		if (unlink(linkloc)  0)
-			ereport(redo ? LOG : (errno == ENOENT ? WARNING : ERROR),
+		{
+			int saved_errno = errno;
+
+			ereport(redo ? LOG : (saved_errno == ENOENT ? WARNING : ERROR),
 	(errcode_for_file_access(),
 	 errmsg(could not remove symbolic link \%s\: %m,
 			linkloc)));
+		}
 	}
 
 	pfree(linkloc_with_version_dir);
diff --git a/src/backend/port/sysv_sema.c b/src/backend/port/sysv_sema.c
index b4825d2..c79c8ad 100644
--- a/src/backend/port/sysv_sema.c
+++ b/src/backend/port/sysv_sema.c
@@ -85,7 +85,8 @@ static void ReleaseSemaphores(int status, Datum arg);
 static IpcSemaphoreId
 InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey, int numSems)
 {
-	int			semId;
+	int			semId,
+saved_errno;
 
 	semId = semget(semKey, numSems, IPC_CREAT | IPC_EXCL | IPCProtection);
 
@@ -107,12 +108,13 @@ InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey, int numSems)
 		/*
 		 * Else complain and abort
 		 */
+		saved_errno = errno;
 		ereport(FATAL,
 (errmsg(could not create semaphores: %m),
  errdetail(Failed system call was semget(%lu, %d, 0%o).,
 		   (unsigned long) semKey, numSems,
 		   IPC_CREAT | IPC_EXCL | IPCProtection),
- (errno == ENOSPC) ?
+ (saved_errno == ENOSPC) ?
  errhint(This error does *not* mean that you have run out of disk space.  
 		  It occurs when either the system limit for the maximum number of 
 			 semaphore sets (SEMMNI), or the system wide maximum number of 
@@ -133,13 +135,14 @@ static void
 IpcSemaphoreInitialize(IpcSemaphoreId semId, int semNum, int value)
 {
 	union semun semun;
+	int			saved_errno = errno;
 
 	semun.val = value;
 	if (semctl(semId, semNum, SETVAL, semun)  0)
 		ereport(FATAL,
 (errmsg_internal(semctl(%d, %d, SETVAL, %d) failed: %m,
  semId, semNum, value),
- (errno == ERANGE) ?
+ (saved_errno == ERANGE) ?
  errhint(You possibly need to raise your kernel's SEMVMX value to be at least 
   %d.  Look into the PostgreSQL documentation for details.,
 		 value) : 0));
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index ac3a9fe..511be72 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -68,6 +68,7 @@ InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size)
 {
 	IpcMemoryId shmid;
 	void	   *memAddress;
+	int			saved_errno = 0;
 
 	shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection);
 
@@ -137,25 +138,26 @@ InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size)
 		 * it should be.  SHMMNI violation is ENOSPC, per spec.  Just plain
 		 * not-enough-RAM is ENOMEM.
 		 */
+		saved_errno = errno;
 		ereport(FATAL,
 (errmsg(could not create shared memory segment: %m),
 		  errdetail(Failed system call was shmget(key=%lu, size=%zu, 0%o).,
 	(unsigned long) memKey, size,
 	IPC_CREAT | IPC_EXCL | IPCProtection),
- (errno == EINVAL) ?
+ (saved_errno == EINVAL) ?
  errhint(This error usually means that PostgreSQL's request for a shared memory 
 		 segment exceeded your kernel's SHMMAX parameter, or possibly that 
 		 it is less than 
 		 your kernel's SHMMIN parameter.\n
 		The PostgreSQL documentation contains more information about shared 
 		 memory configuration.) : 0,
- (errno == ENOMEM) ?
+ (saved_errno == ENOMEM) ?
  errhint(This error usually means that PostgreSQL's request for a shared 
 		 memory segment exceeded your kernel's SHMALL parameter.  You might need 
 		 to reconfigure the kernel with larger SHMALL.\n
 		The PostgreSQL documentation contains more information about shared 
 		 memory configuration.) : 0,
- (errno == ENOSPC) ?
+ (saved_errno == ENOSPC) ?
  errhint(This error does *not* mean that you have run out of disk space.  
 		 It occurs either if all available shared memory IDs have been taken, 
 		 in which case you need to raise the SHMMNI parameter in your kernel, 
@@ -380,9 +382,12 @@ CreateAnonymousSegment(Size *size)
 	}
 
 	if (ptr == MAP_FAILED)
+	{
+		int			saved_errno = errno;
+
 		ereport(FATAL,
 (errmsg(could not map anonymous shared memory: %m),
- (errno == ENOMEM) ?
+ (saved_errno == ENOMEM) ?
  errhint(This error usually means that PostgreSQL's request 
 	for a shared memory segment exceeded available memory, 
 	  swap space

Re: [HACKERS] [PATCH] Use MAP_HUGETLB where supported (v3)

2014-01-29 Thread Christian Kruse
Hi,

after I finally got documentation compilation working I updated the
patch to be syntactically correct. You will find it attached.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 1b5f831..68b38f7 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1128,10 +1128,7 @@ include 'filename'
 The use of huge TLB pages results in smaller page tables and
 less CPU time spent on memory management, increasing performance. For
 more details, see
-ulink url=https://wiki.debian.org/Hugepages;the Debian wiki/ulink.
-Remember that you will need at least shared_buffers / huge page size +
-1 huge TLB pages. So for example for a system with 6GB shared buffers
-and a hugepage size of 2kb of you will need at least 3156 huge pages.
+link linkend=linux-huge-tlb-pagesLinux huge TLB pages/link.
/para
 
para
diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml
index bbb808f..0b98314 100644
--- a/doc/src/sgml/runtime.sgml
+++ b/doc/src/sgml/runtime.sgml
@@ -1307,6 +1307,82 @@ echo -1000  /proc/self/oom_score_adj
/para
/note
   /sect2
+
+  sect2 id=linux-huge-tlb-pages
+   titleLinux huge TLB pages/title
+
+   para
+Nowadays memory address spaces for processes are virtual. This means, when
+a process reserves memory, it gets a virtual memory address which has to
+be translated to a physical memory address by the OS or the CPU. This can
+be done via calculations, but since memory is accessed very often there is
+a cache for that, called Translation Lookaside Buffer,
+short emphasisTLB/emphasis.
+   /para
+
+   para
+When a process reserves memory, this is done in chunks (often
+of literal4kb/literal) named pages. This means if a process requires
+1GB of RAM, it has literal262144/literal (literal1GB/literal
+/ literal4KB/literal) pages and therefore literal262144/literal
+entries for the translation table. Since the TLB has a limited number of
+entries it is obvious that they can't be they can't all be cached, which
+will lead to loss of performance.
+   /para
+
+   para
+One way to tune this is to increase the page size. Most platforms allow
+larger pages, e.g. x86 allows pages of literal2MB/literal. This would
+reduce the number of pages to literal512/literal
+(literal1GB/literal / literal2MB/literal). This reduces the number
+of necessary lookups drastrically.
+   /para
+
+   para
+To enable this feature in productnamePostgreSQL/productname you need a
+kernel with varnameCONFIG_HUGETLBFS=y/varname and
+varnameCONFIG_HUGETLB_PAGE=y/varname. You also have to tune the system
+setting varnamevm.nr_hugepages/varname. To calculate the number of
+necessary huge pages start productnamePostgreSQL/productname without
+huge pages enabled and check the varnameVmPeak/varname value from the
+proc filesystem:
+programlisting
+$ userinputhead -1 /path/to/data/directory/postmaster.pid/userinput
+4170
+$ userinputgrep ^VmPeak /proc/4170/status/userinput
+VmPeak:  6490428 kB
+/programlisting
+ literal6490428/literal / literal2048/literal
+ (varnamePAGE_SIZE/varname literal2MB/literal) are
+ roughly literal3169.154/literal huge pages, so you will need at
+ least literal3170/literal huge pages:
+programlisting
+$ userinputsysctl -w vm.nr_hugepages=3170/userinput
+/programlisting
+Sometimes the kernel is not able to allocate the desired number of huge
+pages, so it might be necessary to repeat that command or to reboot. Don't
+forget to add an entry to filename/etc/sysctl.conf/filename to persist
+this setting through reboots.
+   /para
+
+   para
+The default behavior for huge pages
+in productnamePostgreSQL/productname is to use them when possible and
+to fallback to normal pages when failing. To enforce the use of huge
+pages, you can
+set link linkend=guc-huge-tlb-pagesvarnamehuge_tlb_pages/varname/link
+to literalon/literal. Note that in this
+case productnamePostgreSQL/productname will fail to start if not
+enough huge pages are available.
+   /para
+
+   para
+For a detailed description of the productnameLinux/productname huge
+pages feature have a look
+at ulink url=https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt/ulink.
+   /para
+
+  /sect2
  /sect1
 
 


pgpuCF6bKmrpI.pgp
Description: PGP signature


Re: [HACKERS] Suspicion of a compiler bug in clang: using ternary operator in ereport()

2014-01-29 Thread Christian Kruse
Hi Tom,

On 29/01/14 20:06, Tom Lane wrote:
 Christian Kruse christ...@2ndquadrant.com writes:
  Your reasoning sounds quite logical to me. Thus I did a
  grep -RA 3 ereport src/* | less
  and looked for ereport calls with errno in it. I found quite a few,
  attached you will find a patch addressing that issue.
 
 Committed.

Great! Thanks!

 I found a couple of errors in your patch, but I think everything is
 addressed in the patch as committed.

While I understand most modifications I'm a little bit confused by
some parts. Have a look at for example this one:

+   *errstr = psprintf(_(failed to look up effective user id %ld: %s),
+  (long) user_id,
+errno ? strerror(errno) : _(user does not exist));

Why is it safe here to use errno? It is possible that the _() function
changes errno, isn't it?

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpBVsw_nQE9U.pgp
Description: PGP signature


[HACKERS] Patch: compiling the docs under Gentoo

2014-01-29 Thread Christian Kruse
Hi,

as a Gentoo user I had a hard time getting the documentation
compiled. Attached you will find a Patch explaining exactly this: how
to compile the documentation under Gentoo.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/doc/src/sgml/docguide.sgml b/doc/src/sgml/docguide.sgml
index c9c9862..6133205 100644
--- a/doc/src/sgml/docguide.sgml
+++ b/doc/src/sgml/docguide.sgml
@@ -290,6 +290,24 @@ sudo port install docbook-dsssl docbook-sgml-4.2 docbook-xml-4.2 docbook-xsl lib
   /sect2
 
   sect2
+   titleGentoo/title
+
+   para
+When using Gentoo setting up the tool chain is relatively easy:
+programlisting
+sudo emerge -av app-text/openjade app-text/docbook-sgml-dtd:4.2 app-text/docbook-dsssl-stylesheets app-text/docbook-xsl-stylesheets libxslt
+/programlisting
+Since Gentoo often supports different versions of a package to be
+installed you have to tell the PostgreSQL build environment where the
+Docbook DTD is located:
+programlisting
+cd /path/to/postgresql/sources/doc
+make DOCBOOKSTYLE=/usr/share/sgml/docbook/sgml-dtd-4.2
+/programlisting
+   /para
+  /sect2
+
+  sect2
titleManual Installation from Source/title
 
para


pgpHn3ldfM5Gh.pgp
Description: PGP signature


Re: [HACKERS] Patch: Show process IDs of processes holding a lock; show relation and tuple infos of a lock to acquire

2014-01-28 Thread Christian Kruse
Hi,

On 27/01/14 11:44, Rajeev rastogi wrote:
 I have checked the revised patch. It looks fine to me except one minor code 
 formatting issue.
 In elog.c, two tabs are missing in the definition of function 
 errdetail_log_plural.
 Please run pgindent tool to check the same.

I did, but this reformats various other locations in the file,
too. Nevertheless I now ran pg_indent against it and removed the other
parts. Attached you will find the corrected patch version.

 Also I would like to highlight one behavior here is that process ID of 
 process trying to
 acquire lock is also listed in the list of Request queue. E.g.

   session 1 with process id X: BEGIN; LOCK TABLE foo IN SHARE MODE;
   session 2 with process id Y: BEGIN; LOCK TABLE foo IN EXCLUSIVE MODE;

 On execution of LOCK in session-2, as part of log it will display as:
   DETAIL:  Process holding the lock: X. Request queue: Y.

   Where Y is the process ID of same process, which was trying to acquire 
 lock.

This is on purpose due to the rewording of the Message. In the first
version the PID of the backend was missing.

Thanks for the review!

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index ee6c24c..6c648cf 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -1195,13 +1195,23 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 		 */
 		if (log_lock_waits  deadlock_state != DS_NOT_YET_CHECKED)
 		{
-			StringInfoData buf;
+			StringInfoData buf,
+		lock_waiters_sbuf,
+		lock_holders_sbuf;
 			const char *modename;
 			long		secs;
 			int			usecs;
 			long		msecs;
+			SHM_QUEUE  *procLocks;
+			PROCLOCK   *proclock;
+			bool		first_holder = true,
+		first_waiter = true;
+			int			lockHoldersNum = 0;
 
 			initStringInfo(buf);
+			initStringInfo(lock_waiters_sbuf);
+			initStringInfo(lock_holders_sbuf);
+
 			DescribeLockTag(buf, locallock-tag.lock);
 			modename = GetLockmodeName(locallock-tag.lock.locktag_lockmethodid,
 	   lockmode);
@@ -1211,10 +1221,67 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 			msecs = secs * 1000 + usecs / 1000;
 			usecs = usecs % 1000;
 
+			/*
+			 * we loop over the lock's procLocks to gather a list of all
+			 * holders and waiters. Thus we will be able to provide more
+			 * detailed information for lock debugging purposes.
+			 *
+			 * lock-procLocks contains all processes which hold or wait for
+			 * this lock.
+			 */
+
+			LWLockAcquire(partitionLock, LW_SHARED);
+
+			procLocks = (lock-procLocks);
+			proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+			   offsetof(PROCLOCK, lockLink));
+
+			while (proclock)
+			{
+/*
+ * we are a waiter if myProc-waitProcLock == proclock; we are
+ * a holder if it is NULL or something different
+ */
+if (proclock-tag.myProc-waitProcLock == proclock)
+{
+	if (first_waiter)
+	{
+		appendStringInfo(lock_waiters_sbuf, %d,
+		 proclock-tag.myProc-pid);
+		first_waiter = false;
+	}
+	else
+		appendStringInfo(lock_waiters_sbuf, , %d,
+		 proclock-tag.myProc-pid);
+}
+else
+{
+	if (first_holder)
+	{
+		appendStringInfo(lock_holders_sbuf, %d,
+		 proclock-tag.myProc-pid);
+		first_holder = false;
+	}
+	else
+		appendStringInfo(lock_holders_sbuf, , %d,
+		 proclock-tag.myProc-pid);
+
+	lockHoldersNum++;
+}
+
+proclock = (PROCLOCK *) SHMQueueNext(procLocks, proclock-lockLink,
+			   offsetof(PROCLOCK, lockLink));
+			}
+
+			LWLockRelease(partitionLock);
+
 			if (deadlock_state == DS_SOFT_DEADLOCK)
 ereport(LOG,
 		(errmsg(process %d avoided deadlock for %s on %s by rearranging queue order after %ld.%03d ms,
-			  MyProcPid, modename, buf.data, msecs, usecs)));
+MyProcPid, modename, buf.data, msecs, usecs),
+		 (errdetail_log_plural(Process holding the lock: %s. Request queue: %s.,
+		Processes holding the lock: %s. Request queue: %s.,
+			   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data;
 			else if (deadlock_state == DS_HARD_DEADLOCK)
 			{
 /*
@@ -1226,13 +1293,19 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
  */
 ereport(LOG,
 		(errmsg(process %d detected deadlock while waiting for %s on %s after %ld.%03d ms,
-			  MyProcPid, modename, buf.data, msecs, usecs)));
+MyProcPid, modename, buf.data, msecs, usecs),
+		 (errdetail_log_plural(Process holding the lock: %s. Request queue: %s.,
+			Processes holding lock: %s. Request queue: %s.,
+			   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data;
 			}
 
 			if (myWaitStatus == STATUS_WAITING)
 ereport(LOG,
 		(errmsg(process %d still waiting for %s on %s after

Re: [HACKERS] [PATCH] Use MAP_HUGETLB where supported (v3)

2014-01-28 Thread Christian Kruse
Hi,

On 28/01/14 13:51, Heikki Linnakangas wrote:
 Oh darn, I remembered we had already committed this, but clearly not. I'd
 love to still get this into 9.4. The latest patch (hugepages-v5.patch) was
 pretty much ready for commit, except for documentation.

I'm working on it. I ported it to HEAD and currently doing some
benchmarks. Next will be documentation.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpHMKmOD_jGn.pgp
Description: PGP signature


Re: [HACKERS] [PATCH] Use MAP_HUGETLB where supported (v3)

2014-01-28 Thread Christian Kruse
Hi,

On 15/11/13 15:17, Heikki Linnakangas wrote:
 I spent some time whacking this around, new patch version attached. I moved
 the mmap() code into a new function, that leaves the PGSharedMemoryCreate
 more readable.

I think there's a bug in this version of the patch. Have a look at
this:

+   if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY)
+   {
[…]
+   ptr = mmap(NULL, *size, PROT_READ | PROT_WRITE,
+  PG_MMAP_FLAGS | MAP_HUGETLB, -1, 0);
[…]
+   }
+#endif
+
+   if (huge_tlb_pages == HUGE_TLB_OFF || huge_tlb_pages == HUGE_TLB_TRY)
+   {
+   allocsize = *size;
+   ptr = mmap(NULL, *size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS, 
-1, 0);
+   }

This will lead to a duplicate mmap() if hugepages work and
huge_tlb_pages == HUGE_TLB_TRY, or am I missing something?
I think it should be like this:

if (huge_tlb_pages == HUGE_TLB_OFF ||
(huge_tlb_pages == HUGE_TLB_TRY  ptr == MAP_FAILED))

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpG9E74KaJDV.pgp
Description: PGP signature


[HACKERS] Suspicion of a compiler bug in clang: using ternary operator in ereport()

2014-01-28 Thread Christian Kruse
Hi,

just a word of warning: it seems as if there is compiler bug in clang
regarding the ternary operator when used in ereport(). While working
on a patch I found that this code:

ereport(FATAL,
(errmsg(could not map anonymous shared memory: 
%m),
 (errno == ENOMEM) ?
 errhint(This error usually means that 
PostgreSQL's request 
 for a shared memory segment 
exceeded available memory 
 or swap space. To reduce the 
request size (currently 
 %zu bytes), reduce 
PostgreSQL's shared memory usage, 
 perhaps by reducing 
shared_buffers or 
 max_connections.,
 *size) : 0));

did not emit a errhint when using clang, although errno == ENOMEM was
true. The same code works with gcc. I used the same data dir, so
config was exactly the same, too.

I reported this bug at clang.org:

http://llvm.org/bugs/show_bug.cgi?id=18644

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpA2ZYq2t26Z.pgp
Description: PGP signature


Re: [HACKERS] Suspicion of a compiler bug in clang: using ternary operator in ereport()

2014-01-28 Thread Christian Kruse
Hi,

when I remove the errno comparison and use a 1 it works:

ereport(FATAL,
(errmsg(could not map anonymous shared memory: %m),
 1 ?
 errhint(This error usually means that PostgreSQL's request 
 for a shared memory segment exceeded available memory 

 or swap space. To reduce the request size (currently 
 %zu bytes), reduce PostgreSQL's shared memory usage, 
 perhaps by reducing shared_buffers or 
 max_connections.,
 *size) : 0));

Same if I use an if(errno == ENOMEM) instead of the ternary operator.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpjAIs_29NNJ.pgp
Description: PGP signature


Re: [HACKERS] [PATCH] Use MAP_HUGETLB where supported (v3)

2014-01-28 Thread Christian Kruse
Hi,

attached you will find a new version of the patch, ported to HEAD,
fixed the mentioned bug and - hopefully - dealing the the remaining
issues.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 14ed6c7..e7c2559 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1107,6 +1107,43 @@ include 'filename'
   /listitem
  /varlistentry
 
+ varlistentry id=guc-huge-tlb-pages xreflabel=huge_tlb_pages
+  termvarnamehuge_tlb_pages/varname (typeenum/type)/term
+  indexterm
+   primaryvarnamehuge_tlb_pages/ configuration parameter/primary
+  /indexterm
+  listitem
+   para
+Enables/disables the use of huge TLB pages. Valid values are
+literaltry/literal (the default), literalon/literal,
+and literaloff/literal.
+   /para
+
+   para
+At present, this feature is supported only on Linux. The setting
+is ignored on other systems.
+   /para
+
+   para
+The use of huge TLB pages results in smaller page tables and
+less CPU time spent on memory management, increasing performance. For
+more details, see
+ulink url=https://wiki.debian.org/Hugepages;the Debian wiki/ulink.
+Remember that you will need at least shared_buffers / huge page size +
+1 huge TLB pages. So for example for a system with 6GB shared buffers
+and a hugepage size of 2kb of you will need at least 3156 huge pages.
+   /para
+
+   para
+With varnamehuge_tlb_pages/varname set to literaltry/literal,
+the server will try to use huge pages, but fall back to using
+normal allocation if that fails. With literalon/literal, failure
+to use huge pages will prevent the server from starting up. With
+literaloff/literal, huge pages will not be used.
+   /para
+  /listitem
+ /varlistentry
+
  varlistentry id=guc-temp-buffers xreflabel=temp_buffers
   termvarnametemp_buffers/varname (typeinteger/type)/term
   indexterm
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 0d01617..b3b87d7 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -32,6 +32,7 @@
 #include portability/mem.h
 #include storage/ipc.h
 #include storage/pg_shmem.h
+#include utils/guc.h
 
 
 typedef key_t IpcMemoryKey;		/* shared memory key passed to shmget(2) */
@@ -41,7 +42,7 @@ typedef int IpcMemoryId;		/* shared memory ID returned by shmget(2) */
 unsigned long UsedShmemSegID = 0;
 void	   *UsedShmemSegAddr = NULL;
 static Size AnonymousShmemSize;
-static void *AnonymousShmem;
+static void *AnonymousShmem = NULL;
 
 static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
 static void IpcMemoryDetach(int status, Datum shmaddr);
@@ -317,6 +318,80 @@ PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
 	return true;
 }
 
+/*
+ * Creates an anonymous mmap()ed shared memory segment.
+ *
+ * Pass the desired size in *size. This function will modify *size to the
+ * actual size of the allocation, if it ends up allocating a larger than
+ * desired segment.
+ */
+#ifndef EXEC_BACKEND
+static void *
+CreateAnonymousSegment(Size *size)
+{
+	Size		allocsize;
+	void	   *ptr = MAP_FAILED;
+
+#ifndef MAP_HUGETLB
+	if (huge_tlb_pages == HUGE_TLB_ON)
+		ereport(ERROR,
+(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg(huge TLB pages not supported on this platform)));
+#else
+	if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY)
+	{
+		/*
+		 * Round up the request size to a suitable large value.
+		 *
+		 * Some Linux kernel versions are known to have a bug, which causes
+		 * mmap() with MAP_HUGETLB to fail if the request size is not a
+		 * multiple of any supported huge page size. To work around that, we
+		 * round up the request size to nearest 2MB. 2MB is the most common
+		 * huge page page size on affected systems.
+		 *
+		 * Aside from that bug, even with a kernel that does the allocation
+		 * correctly, rounding it up ourselvees avoids wasting memory. Without
+		 * it, if we for example make an allocation of 2MB + 1 bytes, the
+		 * kernel might decide to use two 2MB huge pages for that, and waste 2
+		 * MB - 1 of memory. When we do the rounding ourselves, we can use
+		 * that space for allocations.
+		 */
+		int			hugepagesize = 2 * 1024 * 1024;
+
+		allocsize = *size;
+		if (allocsize % hugepagesize != 0)
+			allocsize += hugepagesize - (allocsize % hugepagesize);
+
+		ptr = mmap(NULL, *size, PROT_READ | PROT_WRITE,
+   PG_MMAP_FLAGS | MAP_HUGETLB, -1, 0);
+		if (huge_tlb_pages == HUGE_TLB_TRY  ptr == MAP_FAILED)
+			elog(DEBUG1, mmap with MAP_HUGETLB failed, huge pages disabled: %m);
+	}
+#endif
+
+	if (huge_tlb_pages == HUGE_TLB_OFF ||
+		(huge_tlb_pages == HUGE_TLB_TRY  ptr == MAP_FAILED

Re: [HACKERS] Suspicion of a compiler bug in clang: using ternary operator in ereport()

2014-01-28 Thread Christian Kruse
Hi,

On 28/01/14 16:43, Christian Kruse wrote:
   ereport(FATAL,
   (errmsg(could not map anonymous shared memory: 
 %m),
(errno == ENOMEM) ?
errhint(This error usually means that 
 PostgreSQL's request 
for a shared memory segment 
 exceeded available memory 
or swap space. To reduce the 
 request size (currently 
%zu bytes), reduce 
 PostgreSQL's shared memory usage, 
perhaps by reducing 
 shared_buffers or 
max_connections.,
*size) : 0));
 
 did not emit a errhint when using clang, although errno == ENOMEM was
 true. The same code works with gcc.

According to http://llvm.org/bugs/show_bug.cgi?id=18644#c5 this is not
a compiler bug but a difference between gcc and clang. Clang seems to
use a left-to-right order of evaluation while gcc uses a right-to-left
order of evaluation. So if errmsg changes errno this would lead to
errno == ENOMEM evaluated to false. I added a watch point on errno and
it turns out that exactly this happens: in src/common/psprintf.c line
114

nprinted = vsnprintf(buf, len, fmt, args);

errno gets set to 0. This means that we will miss errhint/errdetail if
we use errno in a ternary operator and clang.

Should we work on this issue?

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpz5JCRgHOM0.pgp
Description: PGP signature


Re: [HACKERS] Patch: Show process IDs of processes holding a lock; show relation and tuple infos of a lock to acquire

2014-01-23 Thread Christian Kruse
Hi,

 I think you have attached wrong patch.

Hurm. You are right, attached v3, not v4. Sorry.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index ee6c24c..6c648cf 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -1195,13 +1195,23 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 		 */
 		if (log_lock_waits  deadlock_state != DS_NOT_YET_CHECKED)
 		{
-			StringInfoData buf;
+			StringInfoData buf,
+		lock_waiters_sbuf,
+		lock_holders_sbuf;
 			const char *modename;
 			long		secs;
 			int			usecs;
 			long		msecs;
+			SHM_QUEUE  *procLocks;
+			PROCLOCK   *proclock;
+			bool		first_holder = true,
+		first_waiter = true;
+			int			lockHoldersNum = 0;
 
 			initStringInfo(buf);
+			initStringInfo(lock_waiters_sbuf);
+			initStringInfo(lock_holders_sbuf);
+
 			DescribeLockTag(buf, locallock-tag.lock);
 			modename = GetLockmodeName(locallock-tag.lock.locktag_lockmethodid,
 	   lockmode);
@@ -1211,10 +1221,67 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 			msecs = secs * 1000 + usecs / 1000;
 			usecs = usecs % 1000;
 
+			/*
+			 * we loop over the lock's procLocks to gather a list of all
+			 * holders and waiters. Thus we will be able to provide more
+			 * detailed information for lock debugging purposes.
+			 *
+			 * lock-procLocks contains all processes which hold or wait for
+			 * this lock.
+			 */
+
+			LWLockAcquire(partitionLock, LW_SHARED);
+
+			procLocks = (lock-procLocks);
+			proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+			   offsetof(PROCLOCK, lockLink));
+
+			while (proclock)
+			{
+/*
+ * we are a waiter if myProc-waitProcLock == proclock; we are
+ * a holder if it is NULL or something different
+ */
+if (proclock-tag.myProc-waitProcLock == proclock)
+{
+	if (first_waiter)
+	{
+		appendStringInfo(lock_waiters_sbuf, %d,
+		 proclock-tag.myProc-pid);
+		first_waiter = false;
+	}
+	else
+		appendStringInfo(lock_waiters_sbuf, , %d,
+		 proclock-tag.myProc-pid);
+}
+else
+{
+	if (first_holder)
+	{
+		appendStringInfo(lock_holders_sbuf, %d,
+		 proclock-tag.myProc-pid);
+		first_holder = false;
+	}
+	else
+		appendStringInfo(lock_holders_sbuf, , %d,
+		 proclock-tag.myProc-pid);
+
+	lockHoldersNum++;
+}
+
+proclock = (PROCLOCK *) SHMQueueNext(procLocks, proclock-lockLink,
+			   offsetof(PROCLOCK, lockLink));
+			}
+
+			LWLockRelease(partitionLock);
+
 			if (deadlock_state == DS_SOFT_DEADLOCK)
 ereport(LOG,
 		(errmsg(process %d avoided deadlock for %s on %s by rearranging queue order after %ld.%03d ms,
-			  MyProcPid, modename, buf.data, msecs, usecs)));
+MyProcPid, modename, buf.data, msecs, usecs),
+		 (errdetail_log_plural(Process holding the lock: %s. Request queue: %s.,
+		Processes holding the lock: %s. Request queue: %s.,
+			   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data;
 			else if (deadlock_state == DS_HARD_DEADLOCK)
 			{
 /*
@@ -1226,13 +1293,19 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
  */
 ereport(LOG,
 		(errmsg(process %d detected deadlock while waiting for %s on %s after %ld.%03d ms,
-			  MyProcPid, modename, buf.data, msecs, usecs)));
+MyProcPid, modename, buf.data, msecs, usecs),
+		 (errdetail_log_plural(Process holding the lock: %s. Request queue: %s.,
+			Processes holding lock: %s. Request queue: %s.,
+			   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data;
 			}
 
 			if (myWaitStatus == STATUS_WAITING)
 ereport(LOG,
 		(errmsg(process %d still waiting for %s on %s after %ld.%03d ms,
-			  MyProcPid, modename, buf.data, msecs, usecs)));
+MyProcPid, modename, buf.data, msecs, usecs),
+		 (errdetail_log_plural(Process holding the lock: %s. Request queue: %s.,
+		Processes holding the lock: %s. Request queue: %s.,
+			   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data;
 			else if (myWaitStatus == STATUS_OK)
 ereport(LOG,
 	(errmsg(process %d acquired %s on %s after %ld.%03d ms,
@@ -1252,7 +1325,10 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 if (deadlock_state != DS_HARD_DEADLOCK)
 	ereport(LOG,
 			(errmsg(process %d failed to acquire %s on %s after %ld.%03d ms,
-			  MyProcPid, modename, buf.data, msecs, usecs)));
+MyProcPid, modename, buf.data, msecs, usecs),
+			 (errdetail_log_plural(Process holding the lock: %s. Request queue: %s.,
+		Processes holding the lock: %s. Request queue: %s.,
+   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data

Re: [HACKERS] Patch: Show process IDs of processes holding a lock; show relation and tuple infos of a lock to acquire

2014-01-22 Thread Christian Kruse
Hi,

On 22/01/14 12:40, Simon Riggs wrote:
  1. I find a issue in following scenario:
 
  session 1 with process id X: BEGIN; LOCK TABLE foo IN SHARE MODE;
  session 2 with process id Y: BEGIN; LOCK TABLE foo IN EXCLUSIVE 
  MODE;
  session 3 with process id Z: BEGIN; LOCK TABLE foo IN SHARE MODE;
 
  On execution of LOCK in session-3, as part of log it will display 
  as:
  processes owning lock: X, Y
  But actually if we see Y has not yet own the lock, it is still 
  waiting with higher priority.
  It may mislead user.
  May be we should change message to give all meaning i.e. which 
  process is owning lock and
  Which process is already in queue.
 
 Perhaps this?
 
 CONTEXT: lock owner  request queue XXX, XXX, XXX, etc

Fixed.

  2. Can we give a better name to new variable 'buf1'?

Fixed.

  3. Do we need to take performance reading to see if any impact?
 
 Don't think so. Diagnosing problems will help performance, not hinder it

I agree. And this code path will only get executed when log_lock_waits
= on, which seems to be a debugging method to me.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index ee6c24c..588c4ef 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -1195,13 +1195,23 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 		 */
 		if (log_lock_waits  deadlock_state != DS_NOT_YET_CHECKED)
 		{
-			StringInfoData buf;
+			StringInfoData buf,
+		lock_waiters_sbuf,
+		lock_holders_sbuf;
 			const char *modename;
 			long		secs;
 			int			usecs;
 			long		msecs;
+			SHM_QUEUE  *procLocks;
+			PROCLOCK   *proclock;
+			bool		first_holder = true,
+		first_waiter = true;
+			int			lockHoldersNum = 0;
 
 			initStringInfo(buf);
+			initStringInfo(lock_waiters_sbuf);
+			initStringInfo(lock_holders_sbuf);
+
 			DescribeLockTag(buf, locallock-tag.lock);
 			modename = GetLockmodeName(locallock-tag.lock.locktag_lockmethodid,
 	   lockmode);
@@ -1211,10 +1221,58 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 			msecs = secs * 1000 + usecs / 1000;
 			usecs = usecs % 1000;
 
+			LWLockAcquire(partitionLock, LW_SHARED);
+
+			procLocks = (lock-procLocks);
+			proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+			   offsetof(PROCLOCK, lockLink));
+
+			while (proclock)
+			{
+/*
+ * we are a waiter if myProc-waitProcLock == proclock; we are
+ * a holder if it is NULL or something different
+ */
+if (proclock-tag.myProc-waitProcLock == proclock)
+{
+	if (first_waiter)
+	{
+		appendStringInfo(lock_waiters_sbuf, %d,
+		 proclock-tag.myProc-pid);
+		first_waiter = false;
+	}
+	else
+		appendStringInfo(lock_waiters_sbuf, , %d,
+		 proclock-tag.myProc-pid);
+}
+else
+{
+	if (first_holder)
+	{
+		appendStringInfo(lock_holders_sbuf, %d,
+		 proclock-tag.myProc-pid);
+		first_holder = false;
+	}
+	else
+		appendStringInfo(lock_holders_sbuf, , %d,
+		 proclock-tag.myProc-pid);
+
+	lockHoldersNum++;
+}
+
+proclock = (PROCLOCK *) SHMQueueNext(procLocks, proclock-lockLink,
+			   offsetof(PROCLOCK, lockLink));
+			}
+
+			LWLockRelease(partitionLock);
+
 			if (deadlock_state == DS_SOFT_DEADLOCK)
 ereport(LOG,
 		(errmsg(process %d avoided deadlock for %s on %s by rearranging queue order after %ld.%03d ms,
-			  MyProcPid, modename, buf.data, msecs, usecs)));
+MyProcPid, modename, buf.data, msecs, usecs),
+		 (errcontext(ngettext(process owning lock: %s request queue: %s,
+			   processes owning lock: %s request queue: %s,
+			  lockHoldersNum), lock_holders_sbuf.data, lock_waiters_sbuf.data;
 			else if (deadlock_state == DS_HARD_DEADLOCK)
 			{
 /*
@@ -1226,13 +1284,19 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
  */
 ereport(LOG,
 		(errmsg(process %d detected deadlock while waiting for %s on %s after %ld.%03d ms,
-			  MyProcPid, modename, buf.data, msecs, usecs)));
+MyProcPid, modename, buf.data, msecs, usecs),
+		 (errcontext(ngettext(process owning lock: %s request queue: %s,
+			   processes owning lock: %s request queue: %s,
+			  lockHoldersNum), lock_holders_sbuf.data, lock_waiters_sbuf.data;
 			}
 
 			if (myWaitStatus == STATUS_WAITING)
 ereport(LOG,
 		(errmsg(process %d still waiting for %s on %s after %ld.%03d ms,
-			  MyProcPid, modename, buf.data, msecs, usecs)));
+MyProcPid, modename, buf.data, msecs, usecs),
+		 (errcontext(ngettext(process owning lock: %s request queue: %s,
+			   processes owning lock: %s request queue: %s,
+			  lockHoldersNum

Re: [HACKERS] Patch: Show process IDs of processes holding a lock; show relation and tuple infos of a lock to acquire

2014-01-22 Thread Christian Kruse
Hi,

attached you will find a new version of the patch containing more
comments.

On 22/01/14 12:00, Tom Lane wrote:
 Alvaro Herrera alvhe...@2ndquadrant.com writes:
  This ngettext() call is repeated four times in the new code, which is a
  bit annoying because it's not trivial.  I think you could assign the
  ngettext() to a char * at the bottom of the loop, and then in the
  ereport() calls use it:

 Would that not break the compiler's ability to verify the format codes
 in the string?  Not to mention make it harder for people to compare
 format to arguments, too?

I agree.

 However, the real problem here is that you shouldn't be calling ngettext
 manually in an ereport context in the first place.  There is
 infrastructure in place for that, and this isn't using it.

Fixed in attached patch. I changed it from calling
errorcontext(ngettext()) to calling errdetail_plural().

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 122afb2..552c5a4 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -1195,13 +1195,23 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 		 */
 		if (log_lock_waits  deadlock_state != DS_NOT_YET_CHECKED)
 		{
-			StringInfoData buf;
+			StringInfoData buf,
+		lock_waiters_sbuf,
+		lock_holders_sbuf;
 			const char *modename;
 			long		secs;
 			int			usecs;
 			long		msecs;
+			SHM_QUEUE  *procLocks;
+			PROCLOCK   *proclock;
+			bool		first_holder = true,
+		first_waiter = true;
+			int			lockHoldersNum = 0;
 
 			initStringInfo(buf);
+			initStringInfo(lock_waiters_sbuf);
+			initStringInfo(lock_holders_sbuf);
+
 			DescribeLockTag(buf, locallock-tag.lock);
 			modename = GetLockmodeName(locallock-tag.lock.locktag_lockmethodid,
 	   lockmode);
@@ -1211,10 +1221,67 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 			msecs = secs * 1000 + usecs / 1000;
 			usecs = usecs % 1000;
 
+			LWLockAcquire(partitionLock, LW_SHARED);
+
+			/*
+			 * we loop over the lock's procLocks to gather a list of all
+			 * holders and waiters. Thus we will be able to provide more
+			 * detailed information for lock debugging purposes.
+			 *
+			 * lock-procLocks contains all processes which hold or wait for
+			 * this lock.
+			 */
+
+			procLocks = (lock-procLocks);
+			proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+			   offsetof(PROCLOCK, lockLink));
+
+			while (proclock)
+			{
+/*
+ * we are a waiter if myProc-waitProcLock == proclock; we are
+ * a holder if it is NULL or something different
+ */
+if (proclock-tag.myProc-waitProcLock == proclock)
+{
+	if (first_waiter)
+	{
+		appendStringInfo(lock_waiters_sbuf, %d,
+		 proclock-tag.myProc-pid);
+		first_waiter = false;
+	}
+	else
+		appendStringInfo(lock_waiters_sbuf, , %d,
+		 proclock-tag.myProc-pid);
+}
+else
+{
+	if (first_holder)
+	{
+		appendStringInfo(lock_holders_sbuf, %d,
+		 proclock-tag.myProc-pid);
+		first_holder = false;
+	}
+	else
+		appendStringInfo(lock_holders_sbuf, , %d,
+		 proclock-tag.myProc-pid);
+
+	lockHoldersNum++;
+}
+
+proclock = (PROCLOCK *) SHMQueueNext(procLocks, proclock-lockLink,
+			   offsetof(PROCLOCK, lockLink));
+			}
+
+			LWLockRelease(partitionLock);
+
 			if (deadlock_state == DS_SOFT_DEADLOCK)
 ereport(LOG,
 		(errmsg(process %d avoided deadlock for %s on %s by rearranging queue order after %ld.%03d ms,
-			  MyProcPid, modename, buf.data, msecs, usecs)));
+MyProcPid, modename, buf.data, msecs, usecs),
+(errdetail_plural(process owning lock: %s request queue: %s,
+			   processes owning lock: %s request queue: %s,
+  lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data;
 			else if (deadlock_state == DS_HARD_DEADLOCK)
 			{
 /*
@@ -1226,13 +1293,19 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
  */
 ereport(LOG,
 		(errmsg(process %d detected deadlock while waiting for %s on %s after %ld.%03d ms,
-			  MyProcPid, modename, buf.data, msecs, usecs)));
+MyProcPid, modename, buf.data, msecs, usecs),
+(errdetail_plural(process owning lock: %s request queue: %s,
+			   processes owning lock: %s request queue: %s,
+  lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data;
 			}
 
 			if (myWaitStatus == STATUS_WAITING)
 ereport(LOG,
 		(errmsg(process %d still waiting for %s on %s after %ld.%03d ms,
-			  MyProcPid, modename, buf.data, msecs, usecs)));
+MyProcPid, modename, buf.data, msecs, usecs),
+(errdetail_plural(process owning lock: %s request queue: %s,
+			   processes owning lock: %s request queue: %s,
+  lockHoldersNum

Re: [HACKERS] Patch: Show process IDs of processes holding a lock; show relation and tuple infos of a lock to acquire

2014-01-22 Thread Christian Kruse
Hi,

On 22/01/14 14:45, Tom Lane wrote:
 Well, is it context or detail?  Those fields have reasonably well defined
 meanings IMO.

I find the distinction somewhat blurry and think both would be
appropriate. But since I wasn't sure I changed to detail.

 If we need errcontext_plural, let's add it, not adopt inferior solutions
 just because that isn't there for lack of previous need.

I would've added it if I would've been sure.

 But having said that, I think this is indeed detail not context.
 (I kinda wonder whether some of the stuff that's now in the primary
 message shouldn't be pushed to errdetail as well.  It looks like some
 previous patches in this area have been lazy.)

I agree, the primary message is not very well worded. On the other
hand finding an appropriate alternative seems hard for me.

 While I'm griping, this message isn't even trying to follow the project's
 message style guidelines.  Detail or context messages are supposed to be
 complete sentence(s), with capitalization and punctuation to match.

Hm, I hope I fixed it in this version of the patch.

 Lastly, is this information that we want to be shipping to clients?
 Perhaps from a security standpoint that's not such a wise idea, and
 errdetail_log() is what should be used.

Fixed. I added an errdetail_log_plural() for this, too.

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 122afb2..552c5a4 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -1195,13 +1195,23 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 		 */
 		if (log_lock_waits  deadlock_state != DS_NOT_YET_CHECKED)
 		{
-			StringInfoData buf;
+			StringInfoData buf,
+		lock_waiters_sbuf,
+		lock_holders_sbuf;
 			const char *modename;
 			long		secs;
 			int			usecs;
 			long		msecs;
+			SHM_QUEUE  *procLocks;
+			PROCLOCK   *proclock;
+			bool		first_holder = true,
+		first_waiter = true;
+			int			lockHoldersNum = 0;
 
 			initStringInfo(buf);
+			initStringInfo(lock_waiters_sbuf);
+			initStringInfo(lock_holders_sbuf);
+
 			DescribeLockTag(buf, locallock-tag.lock);
 			modename = GetLockmodeName(locallock-tag.lock.locktag_lockmethodid,
 	   lockmode);
@@ -1211,10 +1221,67 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 			msecs = secs * 1000 + usecs / 1000;
 			usecs = usecs % 1000;
 
+			LWLockAcquire(partitionLock, LW_SHARED);
+
+			/*
+			 * we loop over the lock's procLocks to gather a list of all
+			 * holders and waiters. Thus we will be able to provide more
+			 * detailed information for lock debugging purposes.
+			 *
+			 * lock-procLocks contains all processes which hold or wait for
+			 * this lock.
+			 */
+
+			procLocks = (lock-procLocks);
+			proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+			   offsetof(PROCLOCK, lockLink));
+
+			while (proclock)
+			{
+/*
+ * we are a waiter if myProc-waitProcLock == proclock; we are
+ * a holder if it is NULL or something different
+ */
+if (proclock-tag.myProc-waitProcLock == proclock)
+{
+	if (first_waiter)
+	{
+		appendStringInfo(lock_waiters_sbuf, %d,
+		 proclock-tag.myProc-pid);
+		first_waiter = false;
+	}
+	else
+		appendStringInfo(lock_waiters_sbuf, , %d,
+		 proclock-tag.myProc-pid);
+}
+else
+{
+	if (first_holder)
+	{
+		appendStringInfo(lock_holders_sbuf, %d,
+		 proclock-tag.myProc-pid);
+		first_holder = false;
+	}
+	else
+		appendStringInfo(lock_holders_sbuf, , %d,
+		 proclock-tag.myProc-pid);
+
+	lockHoldersNum++;
+}
+
+proclock = (PROCLOCK *) SHMQueueNext(procLocks, proclock-lockLink,
+			   offsetof(PROCLOCK, lockLink));
+			}
+
+			LWLockRelease(partitionLock);
+
 			if (deadlock_state == DS_SOFT_DEADLOCK)
 ereport(LOG,
 		(errmsg(process %d avoided deadlock for %s on %s by rearranging queue order after %ld.%03d ms,
-			  MyProcPid, modename, buf.data, msecs, usecs)));
+MyProcPid, modename, buf.data, msecs, usecs),
+(errdetail_plural(process owning lock: %s request queue: %s,
+			   processes owning lock: %s request queue: %s,
+  lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data;
 			else if (deadlock_state == DS_HARD_DEADLOCK)
 			{
 /*
@@ -1226,13 +1293,19 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
  */
 ereport(LOG,
 		(errmsg(process %d detected deadlock while waiting for %s on %s after %ld.%03d ms,
-			  MyProcPid, modename, buf.data, msecs, usecs)));
+MyProcPid, modename, buf.data, msecs, usecs),
+(errdetail_plural(process owning lock: %s request queue: %s,
+			   processes owning lock: %s request queue: %s,
+  lockHoldersNum, lock_holders_sbuf.data

Re: [HACKERS] Why conf.d should be default, and auto.conf and recovery.conf should be in it

2014-01-16 Thread Christian Kruse
Hi Alvaro,

On 16/01/14 10:21, Alvaro Herrera wrote:
 1. it is to be read automatically by the server without need for an
include_dir conf.d option in the main postgresql.conf.

+1

 4. there is no point in disabling it, and thus we offer no mechanism
to do that.

Not only there is „no point“ in disabling it, it makes this feature
nearly useless. One can't rely on it if the distro may disable
it. There are so many out there, it will never be a reliable feature
if it can be disabled.

 5. its entries override what is set in postgresql.conf, and are in turn
overridden by what is set in postgresql.auto.conf.

+1

Best regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpfpMOCECyB7.pgp
Description: PGP signature


Re: [HACKERS] Patch: show relation and tuple infos of a lock to acquire

2014-01-02 Thread Christian Kruse
Hi,

On 02/01/14 10:02, Andres Freund wrote:
  Christian's idea of a context line seems plausible to me.  I don't
  care for this implementation too much --- a global variable?  Ick.
 
 Yea, the data should be stored in ErrorContextCallback.arg instead.

Fixed.

I also palloc() the ErrorContextCallback itself. But it doesn't feel
right since I could not find a piece of code doing so. What do you
think?

  Make a wrapper function for XactLockTableWait instead, please.
 
 I don't think that'd work out all too nicely - we do the
 XactLockTableWait() inside other functions like MultiXactIdWait(),
 passing all the detail along for those would end up being ugly. So using
 error context callbacks properly seems like the best way in the end.

Wrapping XactLockTableWait()/MultiXactIdWait() at least allows the
ErrorContextCallback and ErrorContextCallback.arg to live on the
stack. So what we could do is wrap this in a macro instead of a
function (like e.g. PG_TRY) or write two different functions. And I
don't like the two functions approach since it means duplication of
code.

While writing the response I really think writing a macro in PG_TRY
style for setting up and cleaning the error context callback I begin
to think that this would be the best way to go.

  And I'm not real sure that showing the whole tuple contents is a good
  thing; I can see people calling that a security leak, not to mention
  that the performance consequences could be dire.
 
 I don't think that the security argument has too much merit given
 today's PG - we already log the full tuple for various constraint
 violations. And we also accept the performance penalty there.  I don't
 see any easy way to select a sensible subset of columns to print here,
 and printing the columns is what would make investigating issues around
 this so much easier. At the very least we'd need to print the pkey
 columns and the columns of the unique key that might have been violated.

I agree. Even printing only the PK values would be much more
complex. As far as I can see we would even have to gain another lock
for this (see comment for relationHasPrimaryKey() in
src/backend/catalog/index.c).

Regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index f8545c1..cfa49c2 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -2702,9 +2702,11 @@ l1:
 
 		if (infomask  HEAP_XMAX_IS_MULTI)
 		{
+			XactLockTableWaitSetupErrorContextCallback(relation, tp);
 			/* wait for multixact */
 			MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate,
 			NULL, infomask);
+			XactLockTableWaitCleanupErrorContextCallback();
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
@@ -2730,7 +2732,10 @@ l1:
 		else
 		{
 			/* wait for regular transaction to end */
+			XactLockTableWaitSetupErrorContextCallback(relation, tp);
+
 			XactLockTableWait(xwait);
+			XactLockTableWaitCleanupErrorContextCallback();
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
@@ -3254,9 +3259,11 @@ l2:
 			TransactionId update_xact;
 			int			remain;
 
+			XactLockTableWaitSetupErrorContextCallback(relation, oldtup);
 			/* wait for multixact */
 			MultiXactIdWait((MultiXactId) xwait, mxact_status, remain,
 			infomask);
+			XactLockTableWaitCleanupErrorContextCallback();
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
@@ -3330,7 +3337,10 @@ l2:
 			else
 			{
 /* wait for regular transaction to end */
+XactLockTableWaitSetupErrorContextCallback(relation, oldtup);
+
 XactLockTableWait(xwait);
+XactLockTableWaitCleanupErrorContextCallback();
 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 /*
@@ -4398,7 +4408,11 @@ l3:
 		RelationGetRelationName(relation;
 }
 else
+{
+	XactLockTableWaitSetupErrorContextCallback(relation, tuple);
 	MultiXactIdWait((MultiXactId) xwait, status, NULL, infomask);
+	XactLockTableWaitCleanupErrorContextCallback();
+}
 
 /* if there are updates, follow the update chain */
 if (follow_updates 
@@ -4453,7 +4467,11 @@ l3:
 		RelationGetRelationName(relation;
 }
 else
+{
+	XactLockTableWaitSetupErrorContextCallback(relation, tuple);
 	XactLockTableWait(xwait);
+	XactLockTableWaitCleanupErrorContextCallback();
+}
 
 /* if there are updates, follow the update chain */
 if (follow_updates 
@@ -5139,9 +5157,14 @@ l4:
 
 	if (needwait)
 	{
+		XactLockTableWaitSetupErrorContextCallback(rel, mytup);
+
 		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 		XactLockTableWait(members[i].xid);
 		pfree(members);
+
+		XactLockTableWaitCleanupErrorContextCallback();
+
 		goto l4;
 	}
 	if (res != HeapTupleMayBeUpdated)
@@ -5199,8 +5222,13 @@ l4:
  needwait);
 if (needwait

Re: [HACKERS] Patch: show relation and tuple infos of a lock to acquire

2014-01-01 Thread Christian Kruse
Hi,

On 31/12/13 13:56, Peter Geoghegan wrote:
 I think that this is a worthwhile effort, but like Tom I don't think
 that it's universally true that these waiters are waiting on a tuple
 lock. Most obviously, the XactLockTableWait() call you've modified
 within nbtinsert.c is not.

This is why I don't set the tuple data in this case:

 XactLockTableWaitSetupErrorContextCallback(rel, NULL);

The second argument is the tuple argument. If it is set to NULL in the
error context callback, all output regarding tuple are suppressed.

 ISTM that you should be printing just the value and the unique index
 there, and not any information about the tuple proper.

Good point, I will have a look at this.

 For better direction about where that new
 infrastructure ought to live, you might find some useful insight from
 commit 991f3e5ab3f8196d18d5b313c81a5f744f3baaea.

Thanks for the pointer!

But, to be honest, I am still unsure where to put this. As far as I
understand this commit has substantial parts in relcache.c and
elog.c – both don't seem to be very good fitting places?

Regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpFTITwzCXuK.pgp
Description: PGP signature


Re: [HACKERS] Patch: show relation and tuple infos of a lock to acquire

2014-01-01 Thread Christian Kruse
On 31/12/13 11:36, Tom Lane wrote:
 Christian's idea of a context line seems plausible to me.  I don't
 care for this implementation too much --- a global variable?  Ick.
 Make a wrapper function for XactLockTableWait instead, please.

Point taken. You are right.

 And I'm not real sure that showing the whole tuple contents is a good
 thing; I can see people calling that a security leak, not to mention
 that the performance consequences could be dire.

What do you suggest to include? Having some information to identify
the tuple may be very helpful for debugging. This is why I did it this
way.

Regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgphhZyFupMjQ.pgp
Description: PGP signature


[HACKERS] Patch: show relation and tuple infos of a lock to acquire

2013-12-31 Thread Christian Kruse
Hi there,

I created a patch improving the log_lock_wait messages by adding
relation infos (name and OID) as well as tuple infos (if present –
ctid and, if available, the tuple itself) in the context.

Sample output (log_lock_waits=on required):

session 1:
CREATE TABLE foo (val integer);
INSERT INTO foo (val) VALUES (1);
BEGIN;
UPDATE foo SET val = 3;

session 2:
BEGIN;
UPDATE TABLE foo SET val = 2;

Output w/o patch:

LOG:  process 24774 acquired ShareLock on transaction 696 after 11688.720 ms

Output with patch:

LOG:  process 24774 acquired ShareLock on transaction 696 after 11688.720 ms
CONTEXT:  relation name: foo (OID 16385)
tuple (ctid (0,1)): (1)

I am not really sure where to put the functions. Currently they are
located in backend/storage/lmgr/lmgr.c because XactLockTableWait() is
located there, too. What do you think?

I also created a test spec for easy creation of the log output;
however, I was not able to provide an expected file since the process
IDs vary from test run to test run.

Regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index f8545c1..cfa49c2 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -2702,9 +2702,11 @@ l1:
 
 		if (infomask  HEAP_XMAX_IS_MULTI)
 		{
+			XactLockTableWaitSetupErrorContextCallback(relation, tp);
 			/* wait for multixact */
 			MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate,
 			NULL, infomask);
+			XactLockTableWaitCleanupErrorContextCallback();
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
@@ -2730,7 +2732,10 @@ l1:
 		else
 		{
 			/* wait for regular transaction to end */
+			XactLockTableWaitSetupErrorContextCallback(relation, tp);
+
 			XactLockTableWait(xwait);
+			XactLockTableWaitCleanupErrorContextCallback();
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
@@ -3254,9 +3259,11 @@ l2:
 			TransactionId update_xact;
 			int			remain;
 
+			XactLockTableWaitSetupErrorContextCallback(relation, oldtup);
 			/* wait for multixact */
 			MultiXactIdWait((MultiXactId) xwait, mxact_status, remain,
 			infomask);
+			XactLockTableWaitCleanupErrorContextCallback();
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
@@ -3330,7 +3337,10 @@ l2:
 			else
 			{
 /* wait for regular transaction to end */
+XactLockTableWaitSetupErrorContextCallback(relation, oldtup);
+
 XactLockTableWait(xwait);
+XactLockTableWaitCleanupErrorContextCallback();
 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 /*
@@ -4398,7 +4408,11 @@ l3:
 		RelationGetRelationName(relation;
 }
 else
+{
+	XactLockTableWaitSetupErrorContextCallback(relation, tuple);
 	MultiXactIdWait((MultiXactId) xwait, status, NULL, infomask);
+	XactLockTableWaitCleanupErrorContextCallback();
+}
 
 /* if there are updates, follow the update chain */
 if (follow_updates 
@@ -4453,7 +4467,11 @@ l3:
 		RelationGetRelationName(relation;
 }
 else
+{
+	XactLockTableWaitSetupErrorContextCallback(relation, tuple);
 	XactLockTableWait(xwait);
+	XactLockTableWaitCleanupErrorContextCallback();
+}
 
 /* if there are updates, follow the update chain */
 if (follow_updates 
@@ -5139,9 +5157,14 @@ l4:
 
 	if (needwait)
 	{
+		XactLockTableWaitSetupErrorContextCallback(rel, mytup);
+
 		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 		XactLockTableWait(members[i].xid);
 		pfree(members);
+
+		XactLockTableWaitCleanupErrorContextCallback();
+
 		goto l4;
 	}
 	if (res != HeapTupleMayBeUpdated)
@@ -5199,8 +5222,13 @@ l4:
  needwait);
 if (needwait)
 {
+	XactLockTableWaitSetupErrorContextCallback(rel, mytup);
+
 	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 	XactLockTableWait(rawxmax);
+
+	XactLockTableWaitCleanupErrorContextCallback();
+
 	goto l4;
 }
 if (res != HeapTupleMayBeUpdated)
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index a452fea..7f79615 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -165,7 +165,11 @@ top:
 		{
 			/* Have to wait for the other guy ... */
 			_bt_relbuf(rel, buf);
+
+			XactLockTableWaitSetupErrorContextCallback(rel, NULL);
 			XactLockTableWait(xwait);
+			XactLockTableWaitCleanupErrorContextCallback();
+
 			/* start over... */
 			_bt_freestack(stack);
 			goto top;
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index 7ad9720..6e80e37 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -2297,7 +2297,11 @@ IndexBuildHeapScan(Relation heapRelation,
 			 * Must drop the lock on the buffer before we wait
 			 */
 			LockBuffer(scan-rs_cbuf, BUFFER_LOCK_UNLOCK

Re: [HACKERS] Patch: Show process IDs of processes holding a lock; show relation and tuple infos of a lock to acquire

2013-12-31 Thread Christian Kruse
Hi,

On 31/12/13 08:48, Simon Riggs wrote:
  I created two patches..
 
 Patches are related but separate, so should be tracked on separate
 threads.

[x] Done (in 20131231091244.gb25...@defunct.ch)

 Please add them to the CF app also.

[x] Done. I modified the existing commitfest entry
(https://commitfest.postgresql.org/action/patch_view?id=1350) and
the second, new one is located here:

https://commitfest.postgresql.org/action/patch_view?id=1351

Regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpfpe4EtSAQ4.pgp
Description: PGP signature


[HACKERS] Patch: Show process IDs of processes holding a lock; show relation and tuple infos of a lock to acquire

2013-12-30 Thread Christian Kruse
Hi there,

I created two patches improving the log messages generated by
log_lock_waits. The first patch shows the process IDs holding a lock
we try to acquire (show_pids_in_lock_log.patch), sample output
(log_lock_waits=on required):

session 1: BEGIN; LOCK TABLE foo IN SHARE MODE;
session 2: BEGIN; LOCK TABLE foo IN SHARE MODE;
session 3: BEGIN; LOCK TABLE foo IN EXCLUSIVE MODE;

Output w/o patch:

LOG:  process 13777 still waiting for ExclusiveLock on relation 16385 of 
database 16384 after 1000.030 ms

Output with patch:

LOG:  process 13777 still waiting for ExclusiveLock on relation 16385 of 
database 16384 after 1000.030 ms
CONTEXT:  processes owning lock: 13775, 13776

The second patch (show_table_name_and_tuple_in_lock_log.patch)
includes relation info (table name and OID) as well as some tuple
information (if available). Sample output (log_lock_waits=on required):

session 1:
CREATE TABLE foo (val integer);
INSERT INTO foo (val) VALUES (1);
BEGIN;
UPDATE foo SET val = 3;

session 2:
BEGIN;
UPDATE TABLE foo SET val = 2;

Output w/o patch:

LOG:  process 24774 acquired ShareLock on transaction 696 after 11688.720 ms

Output with patch:

LOG:  process 24774 acquired ShareLock on transaction 696 after 11688.720 ms
CONTEXT:  relation name: foo (OID 16385)
tuple (ctid (0,1)): (1)


Regarding this patch I am not really sure where to put the
functions. Currently they are located in backend/storage/lmgr/lmgr.c
because XactLockTableWait() is located there, too. What do you think?


I also created two test specs for easy creation of the log output;
however, I was not able to provide an expected file since the process
IDs vary from test run to test run.

Regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services

diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 122afb2..fdfacdf 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -1195,13 +1195,19 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 		 */
 		if (log_lock_waits  deadlock_state != DS_NOT_YET_CHECKED)
 		{
-			StringInfoData buf;
+			StringInfoData buf,
+		buf1;
 			const char *modename;
 			long		secs;
 			int			usecs;
 			long		msecs;
+			SHM_QUEUE  *procLocks;
+			PROCLOCK   *proclock;
+			bool		first = true;
+			int			lockHoldersNum = 0;
 
 			initStringInfo(buf);
+			initStringInfo(buf1);
 			DescribeLockTag(buf, locallock-tag.lock);
 			modename = GetLockmodeName(locallock-tag.lock.locktag_lockmethodid,
 	   lockmode);
@@ -1211,10 +1217,42 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 			msecs = secs * 1000 + usecs / 1000;
 			usecs = usecs % 1000;
 
+			LWLockAcquire(partitionLock, LW_SHARED);
+
+			procLocks = (lock-procLocks);
+			proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+			   offsetof(PROCLOCK, lockLink));
+
+			while (proclock)
+			{
+if (MyProc-pid != proclock-tag.myProc-pid)
+{
+	if (first)
+	{
+		appendStringInfo(buf1, %d,
+		 proclock-tag.myProc-pid);
+		first = false;
+	}
+	else
+		appendStringInfo(buf1, , %d,
+		 proclock-tag.myProc-pid);
+
+	lockHoldersNum++;
+}
+
+proclock = (PROCLOCK *) SHMQueueNext(procLocks, proclock-lockLink,
+			   offsetof(PROCLOCK, lockLink));
+			}
+
+			LWLockRelease(partitionLock);
+
 			if (deadlock_state == DS_SOFT_DEADLOCK)
 ereport(LOG,
 		(errmsg(process %d avoided deadlock for %s on %s by rearranging queue order after %ld.%03d ms,
-			  MyProcPid, modename, buf.data, msecs, usecs)));
+MyProcPid, modename, buf.data, msecs, usecs),
+		 (errcontext(ngettext(process owning lock: %s,
+			  processes owning lock: %s,
+			  lockHoldersNum), buf1.data;
 			else if (deadlock_state == DS_HARD_DEADLOCK)
 			{
 /*
@@ -1226,13 +1264,19 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
  */
 ereport(LOG,
 		(errmsg(process %d detected deadlock while waiting for %s on %s after %ld.%03d ms,
-			  MyProcPid, modename, buf.data, msecs, usecs)));
+MyProcPid, modename, buf.data, msecs, usecs),
+		 (errcontext(ngettext(process owning lock: %s,
+			  processes owning lock: %s,
+			  lockHoldersNum), buf1.data;
 			}
 
 			if (myWaitStatus == STATUS_WAITING)
 ereport(LOG,
 		(errmsg(process %d still waiting for %s on %s after %ld.%03d ms,
-			  MyProcPid, modename, buf.data, msecs, usecs)));
+MyProcPid, modename, buf.data, msecs, usecs),
+		 (errcontext(ngettext(process owning lock: %s,
+			  processes owning lock: %s,
+			  lockHoldersNum), buf1.data;
 			else if (myWaitStatus == STATUS_OK)
 ereport(LOG,
 	(errmsg(process %d acquired %s on %s after %ld.%03d ms,
@@ -1252,7 +1296,10 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 if (deadlock_state

Re: [HACKERS] Patch: show xid and xmin in pg_stat_activity and pg_stat_replication

2013-12-18 Thread Christian Kruse
Hi,

On 17/12/13 12:08, Robert Haas wrote:
 Please add your patch here so we don't lose track of it:
 
 https://commitfest.postgresql.org/action/commitfest_view/open

Thanks. I nearly forgot that.

Regards,

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgp5adohHq0q6.pgp
Description: PGP signature


Re: [HACKERS] Time-Delayed Standbys

2013-12-04 Thread Christian Kruse
Hi,

On 04/12/13 11:13, KONDO Mitsumasa wrote:
 1) Clusters
 - build master
 - build slave and attach to the master using SR and config 
 recovery_time_delay to
 1min.
 
 2) Stop de Slave
 
 3) Run some transactions on the master using pgbench to generate a lot of 
 archives
 
 4) Start the slave and connect to it using psql and in another session I can 
 see
 all archive recovery log
 Hmm... I had thought my mistake in reading your email, but it reproduce again.
 When I sat small recovery_time_delay(=3), it might work collectry.
 However, I sat long timed recovery_time_delay(=300), it didn't work.
 […]

I'm not sure if I understand your problem correctly. I try to
summarize, please correct if I'm wrong:

You created a master node and a hot standby with 300 delay. Then
you stopped the standby, did the pgbench and startet the hot standby
again. It did not get in line with the master. Is this correct?

I don't see a problem here… the standby should not be in sync with the
master, it should be delayed. I did step by step what you did and
after 50 minutes (300ms) the standby was at the same level the
master was.

Did I missunderstand you?

Regards,
 Christian Kruse

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgp7HACTkLsby.pgp
Description: PGP signature


Re: [HACKERS] Time-Delayed Standbys

2013-12-04 Thread Christian Kruse
Hi,

On 04/12/13 07:22, Kevin Grittner wrote:
 There are many things that a system admin can get wrong.  Failing
 to supply this feature because the sysadmin might not be running
 ntpd (or equivalent) correctly seems to me to be like not having
 the software do fsync because the sysadmin might not have turned
 off write-back buffering on drives without persistent storage. 
 Either way, poor system management can defeat the feature.  Either
 way, I see no reason to withhold the feature from those who manage
 their systems in a sane fashion.

I agree. But maybe we should add a warning in the documentation about
time syncing?

Greetings,
 CK

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgp929ckT_fsN.pgp
Description: PGP signature


Re: [HACKERS] Time-Delayed Standbys

2013-12-03 Thread Christian Kruse
Hi Fabrizio,

looks good to me. I did some testing on 9.2.4, 9.2.5 and HEAD. It
applies and compiles w/o errors or warnings. I set up a master and two
hot standbys replicating from the master, one with 5 minutes delay and
one without delay. After that I created a new database and generated
some test data:

CREATE TABLE test (val INTEGER);
INSERT INTO test (val) (SELECT * FROM generate_series(0, 100));

The non-delayed standby nearly instantly had the data replicated, the
delayed standby was replicated after exactly 5 minutes. I did not
notice any problems, errors or warnings.

Greetings,
 CK

-- 
 Christian Kruse   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training  Services



pgpok2vtj3rMM.pgp
Description: PGP signature


Re: [HACKERS] Patch für MAP_HUGETLB for mmap() shared memory

2012-11-01 Thread Christian Kruse
Hi,

On Monday 29 October 2012 16:33:25 Tom Lane wrote:
 Christian Kruse cjk+postg...@defunct.ch writes:
  I created a patch which implements MAP_HUGETLB for sysv shared memory
  segments (PGSharedMemoryCreate). It is based on tests of Tom Lane and
  Andres Freund, I added error handling, huge page size detection and a GUC
  variable.
 My recollection is we'd decided not to pursue that because of fears that
 it could make performance horribly worse on some platforms.

This is why I created a GUC variable huge_tlb_pages = on|off|try to 
enable/disable this feature.

A performance improvement up to 10% seems worth the trouble of setting another 
config value, IMO.

Greetings,
 CK



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] NOWAIT doesn't work

2012-10-31 Thread Christian Kruse
Hi,

On Wednesday, October 31, 2012 02:51:38 PM Pavel Stehule wrote:
 Hello
 
 it is expected behave?
 
 1.session
 
 postgres=# begin;
 BEGIN
 postgres=# lock oo IN ACCESS EXCLUSIVE MODE;
 LOCK TABLE
 
 2. session
 
 postgres=# select * from oo for update nowait;
 
 hangs forever 

Yes, I think so. From the documentation:

Note that NOWAIT applies only to the row-level lock(s)
(http://www.postgresql.org/docs/9.2/static/sql-select.html)

Greetings,
 CK




-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Re: [HACKERS] Patch für MAP_HUGETLB for mmap() shared memory

2012-10-31 Thread Christian Kruse
Hey,

On 30/10/12 20:33, Andres Freund wrote:
 +#ifdef MAP_HUGETLB
 +#  ifdef __ia64__
 +#define PG_HUGETLB_BASE_ADDR (void *)(0x8000UL)
 +#define PG_MAP_HUGETLB (MAP_HUGETLB|MAP_FIXED)
 +#  else

 Not your fault, but that looks rather strange to me. The level of 
 documentation
 around the requirement of using MAP_FIXED is subpar...

Yeah, looks strange to me, too. This is why I asked if this is really
all we have to do.

 I dislike automatically using the biggest size here. There are platforms with
 16GB hugepages... I think we should rather always use the smallest size and
 leave it to the kernel to choose whatever pagesize he wants to use.

Good point. I will change this to the smallest available page size.

 Thats going to log two warnings if the current system doesn't have hugepage
 support compiled in and thus no /sys/kernel/mm/hugepages directory.
 I think you should 1. only test this if TRY or ON is configured, 2. make the
 messages in InternalGetHugepageSize DEBUG1 at least for the TRY case.

Ok, point taken.

 […]
 + HUGE_TLB_TRY, huge_tlb_options,
 + NULL, NULL, NULL
 + },

 you use HUGE_TLB_TRY with ifdef here.

Ah, right. Setting it to HUGE_TLB_OFF when no MAP_HUGETLB is available.

Greetings,
 CK


pgprK0vhwogK9.pgp
Description: PGP signature


Re: [HACKERS] Re: [HACKERS] Patch für MAP_HUGETLB for mmap() shared memory

2012-10-30 Thread Christian Kruse
Hi,

On 29/10/12 21:14, Peter Geoghegan wrote:
 I have a few initial observations on this.

Thanks for your feedback.


 * I think you should be making the new GUC PGC_INTERNAL on platforms
 where MAP_HUGETLB is not defined or available. See also,
 effective_io_concurrency. This gives sane error handling.

Ok, sounds good for me.

 * Why aren't you failing when HUGE_TLB_ON and the mmap fails? You do
 the same thing as HUGE_TLB_TRY, contrary to your documentation:

  +if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == 
 HUGE_TLB_TRY)

No, what I did was mmap()ing the memory with MAP_HUGETLB and falling
back to mmap() w/o MAP_HUGETLB when mmap() failed. But there was
another bug, I didn't mmap() at all when huge_tlb_pages == off.

 * I think you should add MAP_HUGETLB to PG_MMAP_FLAGS directly, rather
 than XOR'ing after the fact. We already build the flags that comprise
 PG_MMAP_FLAGS using another set of intermediary flags, based on
 platform considerations, so what you're doing here is just
 inconsistent.

This would mean that I have to disable the bit when huge_tlb_pages ==
off. I think it is better to enable it if wanted and to just leave the
flags as they were when this feature has been turned off, do you
disagree?

 Don't wrap the mmap() code in ifdefs, and instead rely
 on the GUC being available on all platforms, with setting the GUC
 causing an error on unsupported platforms, in the style of
 effective_io_concurrency, as established in commit
 3b07182e613a0e63ff662e3dc7f820f010923ec7 . Build a PG_MAP_HUGETLB
 intermediary flag on all platforms.

Ok, this sounds good. It will remove complexity from the code.

 * Apparently you're supposed to do this for the benefit of Itanic [1]:

 /* Only ia64 requires this */
 #ifdef __ia64__
 #define ADDR (void *)(0x8000UL)
 #define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
 #else
 #define ADDR (void *)(0x0UL)
 #define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
 #endif

Hm… is this enough? Or do we have to do more work for ia64?

 * You aren't following our style guide with respect to error messages [2].

Thanks, I wasn't aware of this. I attached a new version of the patch.

Greetings,
 CK


pgp3AmMtkv8SY.pgp
Description: PGP signature


Re: [HACKERS] Patch für MAP_HUGETLB for mmap() shared memory

2012-10-30 Thread Christian Kruse
Hi,

On 29/10/12 16:33, Tom Lane wrote:
  I created a patch which implements MAP_HUGETLB for sysv shared memory 
  segments
  (PGSharedMemoryCreate). It is based on tests of Tom Lane and Andres Freund, 
  I
  added error handling, huge page size detection and a GUC variable.

 My recollection is we'd decided not to pursue that because of fears that
 it could make performance horribly worse on some platforms.

This is why I created a GUC for turning this feature off.

Greetings,
 CK


pgpADXeFXTb5Z.pgp
Description: PGP signature


Re: [HACKERS] Re: [HACKERS] Patch für MAP_HUGETLB for mmap() shared memory

2012-10-30 Thread Christian Kruse
Hey,

Oh man, first I didn't sent the email to the list and now I forgot the
attachment. I should really get some sleep, sorry for any
inconveniences :(

Greetings,
 CK
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index b4fcbaf..66ed10f 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1049,6 +1049,37 @@ include 'filename'
   /listitem
  /varlistentry
 
+ varlistentry id=guc-huge-tlb-pages xreflabel=huge_tlb_pages
+  termvarnamehuge_tlb_pages/varname (typeenum/type)/term
+  indexterm
+   primaryvarnamehuge_tlb_pages/ configuration parameter/primary
+  /indexterm
+  listitem
+   para
+Enables/disables the use of huge tlb pages. Valid values are
+literalon/literal, literaloff/literal and 
literaltry/literal.
+The default value is literaltry/literal.
+   /para
+
+   para
+With varnamehuge_tlb_pages/varname set to literalon/literal
+symbolmmap()/symbol will be called with 
symbolMAP_HUGETLB/symbol.
+If the call fails the server will fail fatally.
+   /para
+
+   para
+With varnamehuge_tlb_pages/varname set to literaloff/literal we
+will not use symbolMAP_HUGETLB/symbol at all.
+   /para
+
+   para
+With varnamehuge_tlb_pages/varname set to literaltry/literal
+we will try to use symbolMAP_HUGETLB/symbol and fall back to
+symbolmmap()/symbol without symbolMAP_HUGETLB/symbol.
+   /para
+  /listitem
+ /varlistentry
+
  varlistentry id=guc-temp-buffers xreflabel=temp_buffers
   termvarnametemp_buffers/varname (typeinteger/type)/term
   indexterm
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index df06312..73cfdef 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -27,10 +27,14 @@
 #ifdef HAVE_SYS_SHM_H
 #include sys/shm.h
 #endif
+#ifdef MAP_HUGETLB
+#include dirent.h
+#endif
 
 #include miscadmin.h
 #include storage/ipc.h
 #include storage/pg_shmem.h
+#include utils/guc.h
 
 
 typedef key_t IpcMemoryKey;/* shared memory key passed to 
shmget(2) */
@@ -61,6 +65,19 @@ typedef int IpcMemoryId; /* shared memory ID 
returned by shmget(2) */
 #define MAP_FAILED ((void *) -1)
 #endif
 
+#ifdef MAP_HUGETLB
+#  ifdef __ia64__
+#define PG_HUGETLB_BASE_ADDR (void *)(0x8000UL)
+#define PG_MAP_HUGETLB (MAP_HUGETLB|MAP_FIXED)
+#  else
+#define PG_HUGETLB_BASE_ADDR (void *)(0x0UL)
+#define PG_MAP_HUGETLB MAP_HUGETLB
+#  endif
+#else
+#  define PG_MAP_HUGETLB 0
+#endif
+
+
 
 unsigned long UsedShmemSegID = 0;
 void  *UsedShmemSegAddr = NULL;
@@ -342,6 +359,154 @@ PGSharedMemoryIsInUse(unsigned long id1, unsigned long 
id2)
 }
 
 
+#ifdef MAP_HUGETLB
+#define HUGE_PAGE_INFO_DIR  /sys/kernel/mm/hugepages
+
+/*
+ * static long InternalGetFreeHugepagesCount(const char *name)
+ *
+ * Attempt to read the number of available hugepages from
+ * /sys/kernel/mm/hugepages/hugepages-size/free_hugepages
+ * Will fail (return -1) if file could not be opened, 0 if no pages are 
available
+ * and  0 if there are free pages
+ *
+ */
+static long
+InternalGetFreeHugepagesCount(const char *name)
+{
+   int fd;
+   char buff[1024];
+   size_t len;
+   long result;
+   char *ptr;
+
+   len = snprintf(buff, 1024, %s/%s/free_hugepages, HUGE_PAGE_INFO_DIR, 
name);
+   if (len == 1024) /* I don't think that this will happen ever */
+   {
+   ereport(WARNING,
+   (errmsg(Filename %s/%s/free_hugepages is too 
long, HUGE_PAGE_INFO_DIR, name),
+errcontext(while checking hugepage size)));
+   return -1;
+   }
+
+   fd = open(buff, O_RDONLY);
+   if (fd = 0)
+   {
+   ereport(WARNING,
+   (errmsg(Could not open file %s: %s, buff, 
strerror(errno)),
+errcontext(while checking hugepage size)));
+   return -1;
+   }
+
+   len = read(fd, buff, 1024);
+   if (len = 0)
+   {
+   ereport(WARNING,
+   (errmsg(Error reading from file %s: %s, buff, 
strerror(errno)),
+errcontext(while checking hugepage size)));
+   close(fd);
+   return -1;
+   }
+
+   /*
+* If the content of free_hugepages is longer than or equal to 1024 
bytes
+* the rest is irrelevant; we simply want to know if there are any
+* hugepages left
+*/
+   if (len == 1024)
+   {
+   buff[1023] = 0;
+   }
+   else
+   {
+   buff[len] = 0;
+   }
+
+   close(fd);
+
+   result = strtol(buff, ptr, 10);
+
+   if (ptr == NULL)
+   {
+   ereport(WARNING,
+   (errmsg(Could not convert contents of file 
%s/%s/free_hugepages 

Re: [HACKERS] Re: [HACKERS] Patch für MAP_HUGETLB for mmap() shared memory

2012-10-30 Thread Christian Kruse
Hey,

On 30/10/12 20:33, Andres Freund wrote:
 +#ifdef MAP_HUGETLB
 +#  ifdef __ia64__
 +#define PG_HUGETLB_BASE_ADDR (void *)(0x8000UL)
 +#define PG_MAP_HUGETLB (MAP_HUGETLB|MAP_FIXED)
 +#  else

 Not your fault, but that looks rather strange to me. The level of 
 documentation
 around the requirement of using MAP_FIXED is subpar...

Yeah, looks strange to me, too. This is why I asked if this is really
all we have to do.

 I dislike automatically using the biggest size here. There are platforms with
 16GB hugepages... I think we should rather always use the smallest size and
 leave it to the kernel to choose whatever pagesize he wants to use.

Good point. I will change this to the smallest available page size.

 Thats going to log two warnings if the current system doesn't have hugepage
 support compiled in and thus no /sys/kernel/mm/hugepages directory.
 I think you should 1. only test this if TRY or ON is configured, 2. make the
 messages in InternalGetHugepageSize DEBUG1 at least for the TRY case.

Ok, point taken.

 […]
 + HUGE_TLB_TRY, huge_tlb_options,
 + NULL, NULL, NULL
 + },

 you use HUGE_TLB_TRY with ifdef here.

Ah, right. Setting it to HUGE_TLB_OFF when no MAP_HUGETLB is available.

Greetings,
 CK


pgp6PpeR5Kpxp.pgp
Description: PGP signature


Re: [HACKERS] Re: [HACKERS] Patch für MAP_HUGETLB for mmap() shared memory

2012-10-30 Thread Christian Kruse
Hey,

ok, I think I implemented all of the changes you requested. All but
the ia64 dependent, I have to do more research for this one.


Greetings,
 CK
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index b4fcbaf..66ed10f 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1049,6 +1049,37 @@ include 'filename'
   /listitem
  /varlistentry
 
+ varlistentry id=guc-huge-tlb-pages xreflabel=huge_tlb_pages
+  termvarnamehuge_tlb_pages/varname (typeenum/type)/term
+  indexterm
+   primaryvarnamehuge_tlb_pages/ configuration parameter/primary
+  /indexterm
+  listitem
+   para
+Enables/disables the use of huge tlb pages. Valid values are
+literalon/literal, literaloff/literal and 
literaltry/literal.
+The default value is literaltry/literal.
+   /para
+
+   para
+With varnamehuge_tlb_pages/varname set to literalon/literal
+symbolmmap()/symbol will be called with 
symbolMAP_HUGETLB/symbol.
+If the call fails the server will fail fatally.
+   /para
+
+   para
+With varnamehuge_tlb_pages/varname set to literaloff/literal we
+will not use symbolMAP_HUGETLB/symbol at all.
+   /para
+
+   para
+With varnamehuge_tlb_pages/varname set to literaltry/literal
+we will try to use symbolMAP_HUGETLB/symbol and fall back to
+symbolmmap()/symbol without symbolMAP_HUGETLB/symbol.
+   /para
+  /listitem
+ /varlistentry
+
  varlistentry id=guc-temp-buffers xreflabel=temp_buffers
   termvarnametemp_buffers/varname (typeinteger/type)/term
   indexterm
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index df06312..f9de239 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -27,10 +27,14 @@
 #ifdef HAVE_SYS_SHM_H
 #include sys/shm.h
 #endif
+#ifdef MAP_HUGETLB
+#include dirent.h
+#endif
 
 #include miscadmin.h
 #include storage/ipc.h
 #include storage/pg_shmem.h
+#include utils/guc.h
 
 
 typedef key_t IpcMemoryKey;/* shared memory key passed to 
shmget(2) */
@@ -61,6 +65,19 @@ typedef int IpcMemoryId; /* shared memory ID 
returned by shmget(2) */
 #define MAP_FAILED ((void *) -1)
 #endif
 
+#ifdef MAP_HUGETLB
+#  ifdef __ia64__
+#define PG_HUGETLB_BASE_ADDR (void *)(0x8000UL)
+#define PG_MAP_HUGETLB (MAP_HUGETLB|MAP_FIXED)
+#  else
+#define PG_HUGETLB_BASE_ADDR (void *)(0x0UL)
+#define PG_MAP_HUGETLB MAP_HUGETLB
+#  endif
+#else
+#  define PG_MAP_HUGETLB 0
+#endif
+
+
 
 unsigned long UsedShmemSegID = 0;
 void  *UsedShmemSegAddr = NULL;
@@ -73,7 +90,6 @@ static void IpcMemoryDelete(int status, Datum shmId);
 static PGShmemHeader *PGSharedMemoryAttach(IpcMemoryKey key,
 IpcMemoryId *shmid);
 
-
 /*
  * InternalIpcMemoryCreate(memKey, size)
  *
@@ -342,6 +358,155 @@ PGSharedMemoryIsInUse(unsigned long id1, unsigned long 
id2)
 }
 
 
+#ifdef MAP_HUGETLB
+#define HUGE_PAGE_INFO_DIR  /sys/kernel/mm/hugepages
+
+/*
+ * static long InternalGetFreeHugepagesCount(const char *name)
+ *
+ * Attempt to read the number of available hugepages from
+ * /sys/kernel/mm/hugepages/hugepages-size/free_hugepages
+ * Will fail (return -1) if file could not be opened, 0 if no pages are 
available
+ * and  0 if there are free pages
+ *
+ */
+static long
+InternalGetFreeHugepagesCount(const char *name)
+{
+   int fd;
+   char buff[1024];
+   size_t len;
+   long result;
+   char *ptr;
+
+   len = snprintf(buff, 1024, %s/%s/free_hugepages, HUGE_PAGE_INFO_DIR, 
name);
+   if (len == 1024) /* I don't think that this will happen ever */
+   {
+   ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+   (errmsg(Filename %s/%s/free_hugepages is too 
long, HUGE_PAGE_INFO_DIR, name),
+errcontext(while checking hugepage size)));
+   return -1;
+   }
+
+   fd = open(buff, O_RDONLY);
+   if (fd = 0)
+   {
+   ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+   (errmsg(Could not open file %s: %s, buff, 
strerror(errno)),
+errcontext(while checking hugepage size)));
+   return -1;
+   }
+
+   len = read(fd, buff, 1024);
+   if (len = 0)
+   {
+   ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+   (errmsg(Error reading from file %s: %s, buff, 
strerror(errno)),
+errcontext(while checking hugepage size)));
+   close(fd);
+   return -1;
+   }
+
+   /*
+* If the content of free_hugepages is longer than or equal to 1024 
bytes
+* the rest is irrelevant; we simply want to know if there are any
+* hugepages left
+*/
+   

Re: [HACKERS] Re: [HACKERS] Patch für MAP_HUGETLB for mmap() shared memory

2012-10-30 Thread Christian Kruse
Hey,

On Tuesday 30 October 2012 20:33:18 Andres Freund wrote:
 +#ifdef MAP_HUGETLB
 +#  ifdef __ia64__
 +#define PG_HUGETLB_BASE_ADDR (void *)(0x8000UL)
 +#define PG_MAP_HUGETLB (MAP_HUGETLB|MAP_FIXED)
 +#  else
 
 Not your fault, but that looks rather strange to me. The level of
 documentation around the requirement of using MAP_FIXED is subpar...

Ok, after further reading I think with MAP_FIXED one has to „generate“ SHM 
segment addresses hisself. So 0x8000UL would just be one possible 
location for a hugepage.

Since this feature is not used in PG for now, the code should work. Could 
someone with access to a ia64 architecture verify it?

Greetings,
 CK



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


[HACKERS] Patch für MAP_HUGETLB for mmap() shared memory

2012-10-29 Thread Christian Kruse
Hey,

this is my first post to the -hackers lists, so be merciful ;-)

I created a patch which implements MAP_HUGETLB for sysv shared memory segments 
(PGSharedMemoryCreate). It is based on tests of Tom Lane and Andres Freund, I 
added error handling, huge page size detection and a GUC variable.

Performance improvements differ from about 1% in the worst case to about 13% in 
the best case. Benchmarking results are as follows:

pgbench -i -s 100 test

Patched:
pgbench -n -S -j 64 -c 64 -T 10 -M prepared test
tps avg: 51879.2


Unpatched:
pgbench -n -S -j 64 -c 64 -T 10 -M prepared test
tps avg: 45321.6

tps increase: 6557.6, 12.6%


Patched:
pgbench -n -S -j 64 -c 64 -T 180 -M prepared test (patched)

number of transactions actually processed: 8767510
tps = 48705.159196 (including connections establishing)
tps = 48749.761241 (excluding connections establishing)


Unpatched:
mit pgbench -n -S -j 64 -c 64 -T 120 -M prepared test (unpatched)

number of transactions actually processed: 8295439
tps = 46083.559187 (including connections establishing)
tps = 46097.763939 (excluding connections establishing)

tps diff: 2652, 5%



create table large (a int, b int);
insert into large (a, b) select s, s + 10 from generate_series(1, 1000) s;

5 times executed, with \timing on:
SELECT sum(a), sum(b) from large;
Time: 1143.880 ms unpatched
Time: 1125.644 ms patched
about 1% difference


The patch ist attached. Any comments?

Greetings,
 CK
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index b4fcbaf..66ed10f 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1049,6 +1049,37 @@ include 'filename'
   /listitem
  /varlistentry
 
+ varlistentry id=guc-huge-tlb-pages xreflabel=huge_tlb_pages
+  termvarnamehuge_tlb_pages/varname (typeenum/type)/term
+  indexterm
+   primaryvarnamehuge_tlb_pages/ configuration parameter/primary
+  /indexterm
+  listitem
+   para
+Enables/disables the use of huge tlb pages. Valid values are
+literalon/literal, literaloff/literal and literaltry/literal.
+The default value is literaltry/literal.
+   /para
+
+   para
+With varnamehuge_tlb_pages/varname set to literalon/literal
+symbolmmap()/symbol will be called with symbolMAP_HUGETLB/symbol.
+If the call fails the server will fail fatally.
+   /para
+
+   para
+With varnamehuge_tlb_pages/varname set to literaloff/literal we
+will not use symbolMAP_HUGETLB/symbol at all.
+   /para
+
+   para
+With varnamehuge_tlb_pages/varname set to literaltry/literal
+we will try to use symbolMAP_HUGETLB/symbol and fall back to
+symbolmmap()/symbol without symbolMAP_HUGETLB/symbol.
+   /para
+  /listitem
+ /varlistentry
+
  varlistentry id=guc-temp-buffers xreflabel=temp_buffers
   termvarnametemp_buffers/varname (typeinteger/type)/term
   indexterm
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index df06312..f5d212f 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -27,6 +27,9 @@
 #ifdef HAVE_SYS_SHM_H
 #include sys/shm.h
 #endif
+#ifdef MAP_HUGETLB
+#include dirent.h
+#endif
 
 #include miscadmin.h
 #include storage/ipc.h
@@ -67,6 +70,12 @@ void	   *UsedShmemSegAddr = NULL;
 static Size AnonymousShmemSize;
 static void *AnonymousShmem;
 
+#ifdef MAP_HUGETLB
+HugeTlbType huge_tlb_pages = HUGE_TLB_TRY;
+#else
+HugeTlbType huge_tlb_pages = HUGE_TLB_OFF;
+#endif
+
 static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
 static void IpcMemoryDetach(int status, Datum shmaddr);
 static void IpcMemoryDelete(int status, Datum shmId);
@@ -342,6 +351,140 @@ PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
 }
 
 
+#ifdef MAP_HUGETLB
+#define HUGE_PAGE_INFO_DIR  /sys/kernel/mm/hugepages
+
+/*
+ *	static long InternalGetFreeHugepagesCount(const char *name)
+ *
+ * Attempt to read the number of available hugepages from
+ * /sys/kernel/mm/hugepages/hugepages-size/free_hugepages
+ * Will fail (return -1) if file could not be opened, 0 if no pages are available
+ * and  0 if there are free pages
+ *
+ */
+static long
+InternalGetFreeHugepagesCount(const char *name)
+{
+	int fd;
+	char buff[1024];
+	size_t len;
+	long result;
+	char *ptr;
+
+	len = snprintf(buff, 1024, %s/%s/free_hugepages, HUGE_PAGE_INFO_DIR, name);
+	if (len == 1024) /* I don't think that this will happen ever */
+	{
+		elog(LOG, Filename %s/%s/free_hugepages is too long!, HUGE_PAGE_INFO_DIR, name);
+		return -1;
+	}
+
+	fd = open(buff, O_RDONLY);
+	if (fd = 0)
+	{
+		elog(LOG, open(%s) failed: %s, buff, strerror(errno));
+		return -1;
+	}
+
+	len = read(fd, buff, 1024);
+	if (len = 0)
+	{
+		elog(LOG, Error reading file %s: %s\n, buff, strerror(errno));
+		close(fd);
+		return -1;
+	}
+
+	/*
+	 * If the content of free_hugepages is longer than or equal to 1024 bytes
+	 * the rest is irrelevant; we simply want