Re: Add connection active, idle time to pg_stat_activity

2021-11-16 Thread Rafia Sabih
On Mon, 15 Nov 2021 at 12:40, Dilip Kumar  wrote:
>
> On Mon, Nov 15, 2021 at 4:46 PM Rafia Sabih  wrote:
> >
> > On Mon, 15 Nov 2021 at 10:24, Dilip Kumar  wrote:
> > >
> > > On Wed, Nov 10, 2021 at 1:47 PM Rafia Sabih  
> > > wrote:
> > > >
> > > > > It seems that in beentry->st_idle_time, you want to compute the
> > > > > STATE_IDLE, but that state is not handled in the outer "if", that
> > > > > means whenever it comes out of the
> > > > > STATE_IDLE, it will not enter inside this if check.  You can run and
> > > > > test, I am sure that with this patch the "idle_time" will always
> > > > > remain 0.
> > > > >
> > > > Thank you Dilip for your time on this.
> > > > And yes you are right in both your observations.
> > > > Please find the attached patch for the updated version.
> > >
> > > Looks fine now except these variable names,
> > >
> > >  PgStat_Counter pgStatTransactionIdleTime = 0;
> > > +PgStat_Counter pgStatTransactionIdleInTxnTime = 0;
> > >
> > > Now, pgStatTransactionIdleTime is collecting just the Idle time so
> > > pgStatTransactionIdleTime should be renamed to "pgStatIdleTime" and
> > > pgStatTransactionIdleInTxnTime should be renamed to
> > > "pgStatTransactionIdleTime"
> > >
> > Good point!
> > Done.
>
> @@ -1018,7 +1019,7 @@ pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg,
> TimestampTz now)
>   pgLastSessionReportTime = now;
>   tsmsg->m_session_time = (PgStat_Counter) secs * 100 + usecs;
>   tsmsg->m_active_time = pgStatActiveTime;
> - tsmsg->m_idle_in_xact_time = pgStatTransactionIdleTime;
> + tsmsg->m_idle_in_xact_time = pgStatIdleTime;
>
> I think this change is wrong,  basically, "tsmsg->m_idle_in_xact_time"
> is used for counting the database level idle in transaction count, you
> can check "pg_stat_get_db_idle_in_transaction_time" function for that.
> So "pgStatTransactionIdleTime" is the variable counting the idle in
> transaction time, pgStatIdleTime is just counting the idle time
> outside the transaction so if we make this change we are changing the
> meaning of tsmsg->m_idle_in_xact_time.

Got it.
Updated.
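
For reference, the database-level counter Dilip mentions is what surfaces
in pg_stat_database (since v14), so the rename must not change what feeds
it. A quick way to sanity-check both levels, assuming a server with this
patch applied (the three pg_stat_activity columns exist only with the
patch):

SELECT datname, active_time, idle_in_transaction_time
FROM pg_stat_database WHERE datname = current_database();
-- database-level: must keep accumulating from pgStatTransactionIdleTime

SELECT pid, active_time, idle_in_transaction_time, idle_time
FROM pg_stat_activity WHERE pid = pg_backend_pid();
-- backend-level: the new per-connection counters from the patch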

-- 
Regards,
Rafia Sabih
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index eb560955cd..4dfa33ffa9 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -839,7 +839,10 @@ CREATE VIEW pg_stat_activity AS
 s.backend_xmin,
 S.query_id,
 S.query,
-S.backend_type
+S.backend_type,
+S.active_time,
+S.idle_in_transaction_time,
+S.idle_time
 FROM pg_stat_get_activity(NULL) AS S
 LEFT JOIN pg_database AS D ON (S.datid = D.oid)
 LEFT JOIN pg_authid AS U ON (S.usesysid = U.oid);
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 8c166e5e16..fc5e58e06b 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -248,6 +248,7 @@ PgStat_Counter pgStatBlockReadTime = 0;
 PgStat_Counter pgStatBlockWriteTime = 0;
 static PgStat_Counter pgLastSessionReportTime = 0;
 PgStat_Counter pgStatActiveTime = 0;
+PgStat_Counter pgStatIdleTime = 0;
 PgStat_Counter pgStatTransactionIdleTime = 0;
 SessionEndType pgStatSessionEndCause = DISCONNECT_NORMAL;
 
@@ -1031,7 +1032,7 @@ pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg, TimestampTz now)
 		pgStatBlockReadTime = 0;
 		pgStatBlockWriteTime = 0;
 		pgStatActiveTime = 0;
-		pgStatTransactionIdleTime = 0;
+		pgStatIdleTime = 0;
 	}
 	else
 	{
diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c
index 7229598822..560fa0fa0c 100644
--- a/src/backend/utils/activity/backend_status.c
+++ b/src/backend/utils/activity/backend_status.c
@@ -577,6 +577,7 @@ pgstat_report_activity(BackendState state, const char *cmd_str)
 	 */
 	if ((beentry->st_state == STATE_RUNNING ||
 		 beentry->st_state == STATE_FASTPATH ||
+		 beentry->st_state == STATE_IDLE ||
 		 beentry->st_state == STATE_IDLEINTRANSACTION ||
 		 beentry->st_state == STATE_IDLEINTRANSACTION_ABORTED) &&
 		state != beentry->st_state)
@@ -590,9 +591,21 @@ pgstat_report_activity(BackendState state, const char *cmd_str)
 
 		if (beentry->st_state == STATE_RUNNING ||
 			beentry->st_state == STATE_FASTPATH)
-			pgstat_count_conn_active_time((PgStat_Counter) secs * 100 + usecs);
-		else
+		{
+			pgstat_count_conn_active_time((PgStat_Counter) secs * 100 + usecs);
+			beentry->st_active_time = pgStatActiveTime;
+		}
+		else if (beentry->st_state == STATE_IDLEINTRANSACTION ||
+				 beentry->st_state == STATE_IDLEINTRANSACTION_ABORTED)
+		{
 			pgstat_count_conn_txn_idle_time((PgStat_Counter) secs * 100 + usecs);
+			beentry->st_transaction_idle_time = pgStatTransactionIdleTime;
+		}
+		else
+		{
+			pgstat_count_conn_idle_time((PgStat_Counter) secs * 100 + usecs);
+			beentry->st_idle_time = pgStatIdleTime;
+		}
 	}

Re: Add connection active, idle time to pg_stat_activity

2021-11-15 Thread Rafia Sabih
On Mon, 15 Nov 2021 at 10:24, Dilip Kumar  wrote:
>
> On Wed, Nov 10, 2021 at 1:47 PM Rafia Sabih  wrote:
> >
> > > It seems that in beentry->st_idle_time, you want to compute the
> > > STATE_IDLE, but that state is not handled in the outer "if", that
> > > means whenever it comes out of the
> > > STATE_IDLE, it will not enter inside this if check.  You can run and
> > > test, I am sure that with this patch the "idle_time" will always
> > > remain 0.
> > >
> > Thank you Dilip for your time on this.
> > And yes you are right in both your observations.
> > Please find the attached patch for the updated version.
>
> Looks fine now except these variable names,
>
>  PgStat_Counter pgStatTransactionIdleTime = 0;
> +PgStat_Counter pgStatTransactionIdleInTxnTime = 0;
>
> Now, pgStatTransactionIdleTime is collecting just the Idle time so
> pgStatTransactionIdleTime should be renamed to "pgStatIdleTime" and
> pgStatTransactionIdleInTxnTime should be renamed to
> "pgStatTransactionIdleTime"
>
Good point!
Done.


-- 
Regards,
Rafia Sabih
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index eb560955cd..4dfa33ffa9 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -839,7 +839,10 @@ CREATE VIEW pg_stat_activity AS
 s.backend_xmin,
 S.query_id,
 S.query,
-S.backend_type
+S.backend_type,
+S.active_time,
+S.idle_in_transaction_time,
+S.idle_time
 FROM pg_stat_get_activity(NULL) AS S
 LEFT JOIN pg_database AS D ON (S.datid = D.oid)
 LEFT JOIN pg_authid AS U ON (S.usesysid = U.oid);
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index b7d0fbaefd..84c2aba9e2 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -248,6 +248,7 @@ PgStat_Counter pgStatBlockReadTime = 0;
 PgStat_Counter pgStatBlockWriteTime = 0;
 static PgStat_Counter pgLastSessionReportTime = 0;
 PgStat_Counter pgStatActiveTime = 0;
+PgStat_Counter pgStatIdleTime = 0;
 PgStat_Counter pgStatTransactionIdleTime = 0;
 SessionEndType pgStatSessionEndCause = DISCONNECT_NORMAL;
 
@@ -1018,7 +1019,7 @@ pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg, TimestampTz now)
 			pgLastSessionReportTime = now;
 			tsmsg->m_session_time = (PgStat_Counter) secs * 100 + usecs;
 			tsmsg->m_active_time = pgStatActiveTime;
-			tsmsg->m_idle_in_xact_time = pgStatTransactionIdleTime;
+			tsmsg->m_idle_in_xact_time = pgStatIdleTime;
 		}
 		else
 		{
@@ -1031,7 +1032,7 @@ pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg, TimestampTz now)
 		pgStatBlockReadTime = 0;
 		pgStatBlockWriteTime = 0;
 		pgStatActiveTime = 0;
-		pgStatTransactionIdleTime = 0;
+		pgStatIdleTime = 0;
 	}
 	else
 	{
diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c
index 7229598822..560fa0fa0c 100644
--- a/src/backend/utils/activity/backend_status.c
+++ b/src/backend/utils/activity/backend_status.c
@@ -577,6 +577,7 @@ pgstat_report_activity(BackendState state, const char *cmd_str)
 	 */
 	if ((beentry->st_state == STATE_RUNNING ||
 		 beentry->st_state == STATE_FASTPATH ||
+		 beentry->st_state == STATE_IDLE ||
 		 beentry->st_state == STATE_IDLEINTRANSACTION ||
 		 beentry->st_state == STATE_IDLEINTRANSACTION_ABORTED) &&
 		state != beentry->st_state)
@@ -590,9 +591,21 @@ pgstat_report_activity(BackendState state, const char *cmd_str)
 
 		if (beentry->st_state == STATE_RUNNING ||
 			beentry->st_state == STATE_FASTPATH)
-			pgstat_count_conn_active_time((PgStat_Counter) secs * 100 + usecs);
-		else
+		{
+			pgstat_count_conn_active_time((PgStat_Counter) secs * 100 + usecs);
+			beentry->st_active_time = pgStatActiveTime;
+		}
+		else if (beentry->st_state == STATE_IDLEINTRANSACTION ||
+				 beentry->st_state == STATE_IDLEINTRANSACTION_ABORTED)
+		{
 			pgstat_count_conn_txn_idle_time((PgStat_Counter) secs * 100 + usecs);
+			beentry->st_transaction_idle_time = pgStatTransactionIdleTime;
+		}
+		else
+		{
+			pgstat_count_conn_idle_time((PgStat_Counter) secs * 100 + usecs);
+			beentry->st_idle_time = pgStatIdleTime;
+		}
 	}
 
 	/*
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index ff5aedc99c..4049d0679e 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -567,7 +567,7 @@ pg_stat_get_progress_info(PG_FUNCTION_ARGS)
 Datum
 pg_stat_get_activity(PG_FUNCTION_ARGS)
 {
-#define PG_STAT_GET_ACTIVITY_COLS	30
+#define PG_STAT_GET_ACTIVITY_COLS	33
 	int			num_backends = pgstat_fetch_stat_numbackends();
 	int			curr_backend;
	int			pid = PG_ARGISNULL(0) ? -1 : PG_GETARG_INT32(0);

Re: ORDER BY logic in PostgreSQL source code

2021-11-10 Thread Rafia Sabih
On Wed, 10 Nov 2021 at 11:57, Sajti Zsolt Zoltán  wrote:
>
> I'm currently working on a GiST extension for PostgreSQL and I ran into 
> strange ORDER BY results during my queries.
>
> Because I can't find the problem in my source code, I want to investigate the 
> issue by looking at the PostgreSQL source,
> maybe inserting extra log messages.
> While trying to do this, I came to the conclusion that I need developer help, 
> because I can't find the part of the source code
> where the ORDER BY logic is implemented.
>
> Could you please help, by telling me which files should I take a look at?
>
> Thanks in advance!

I think you can start by looking into the function grouping_planner
and explore further by following its call path.
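
Before adding log messages, it may help to check from the SQL side which
node is actually producing the order. A sketch, assuming a KNN-capable
GiST opclass (table and index names here are made up):

EXPLAIN (COSTS OFF)
SELECT * FROM pts ORDER BY p <-> point '(0,0)' LIMIT 10;
--  Limit
--    ->  Index Scan using pts_p_idx on pts
--          Order By: (p <-> '(0,0)'::point)

If "Order By" appears under the index scan, your opclass's distance
support function is producing the order; if an explicit Sort node shows up
instead, the planner added it, and create_ordered_paths (reached from
grouping_planner) is the place to look.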

-- 
Regards,
Rafia Sabih




Re: Add connection active, idle time to pg_stat_activity

2021-11-10 Thread Rafia Sabih
On Wed, 10 Nov 2021 at 09:05, Dilip Kumar  wrote:
>
> On Tue, Nov 9, 2021 at 8:28 PM Rafia Sabih  wrote:
> >
> > On Tue, 2 Nov 2021 at 09:00, Dilip Kumar  wrote:
> > >
>
> > > About the patch, IIUC earlier all the idle time was accumulated in the
> > > "pgStatTransactionIdleTime" counter, now with your patch you have
> > > introduced one more counter which specifically tracks the
> > > STATE_IDLEINTRANSACTION state.  But my concern is that the
> > > STATE_IDLEINTRANSACTION_ABORTED is still computed under STATE_IDLE and
> > > that looks odd to me.  Either STATE_IDLEINTRANSACTION_ABORTED should
> > > be accumulated in the "pgStatTransactionIdleInTxnTime" counter or
> > > there should be a separate counter for that.  But after your patch we
> > > can not accumulate this in the "pgStatTransactionIdleTime" counter.
> > >
> > As per your comments I have added it in pgStatTransactionIdleInTxnTime.
> > Please let me know if there are any further comments.
>
> I have a few comments,
>
>  nulls[29] = true;
> +values[30] = true;
> +values[31] = true;
> +values[32] = true;
>
> This looks wrong, this should be nulls[] = true not values[]=true.
>
> if ((beentry->st_state == STATE_RUNNING ||
>   beentry->st_state == STATE_FASTPATH ||
>   beentry->st_state == STATE_IDLEINTRANSACTION ||
>   beentry->st_state == STATE_IDLEINTRANSACTION_ABORTED) &&
>   state != beentry->st_state)
> {
> if (beentry->st_state == STATE_RUNNING ||
> beentry->st_state == STATE_FASTPATH)
> {
>pgstat_count_conn_active_time((PgStat_Counter) secs * 100 + usecs);
>   beentry->st_active_time = pgStatActiveTime;
> }
> else if (beentry->st_state == STATE_IDLEINTRANSACTION ||
>beentry->st_state == STATE_IDLEINTRANSACTION_ABORTED)
> {
>   pgstat_count_conn_txn_idle_in_txn_time((PgStat_Counter) secs *
> 100 + usecs);
>   beentry->st_idle_in_transaction_time = pgStatTransactionIdleInTxnTime;
> }
> else
> {
>   pgstat_count_conn_txn_idle_time((PgStat_Counter) secs * 100 + usecs);
>   beentry->st_idle_time = pgStatTransactionIdleTime;
> }
>
> It seems that in beentry->st_idle_time, you want to compute the
> STATE_IDLE, but that state is not handled in the outer "if", that
> means whenever it comes out of the
> STATE_IDLE, it will not enter inside this if check.  You can run and
> test, I am sure that with this patch the "idle_time" will always
> remain 0.
>
Thank you Dilip for your time on this.
And yes you are right in both your observations.
Please find the attached patch for the updated version.

-- 
Regards,
Rafia Sabih
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index eb560955cd..4dfa33ffa9 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -839,7 +839,10 @@ CREATE VIEW pg_stat_activity AS
 s.backend_xmin,
 S.query_id,
 S.query,
-S.backend_type
+S.backend_type,
+S.active_time,
+S.idle_in_transaction_time,
+S.idle_time
 FROM pg_stat_get_activity(NULL) AS S
 LEFT JOIN pg_database AS D ON (S.datid = D.oid)
 LEFT JOIN pg_authid AS U ON (S.usesysid = U.oid);
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index b7d0fbaefd..3e0eb963b3 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -249,6 +249,7 @@ PgStat_Counter pgStatBlockWriteTime = 0;
 static PgStat_Counter pgLastSessionReportTime = 0;
 PgStat_Counter pgStatActiveTime = 0;
 PgStat_Counter pgStatTransactionIdleTime = 0;
+PgStat_Counter pgStatTransactionIdleInTxnTime = 0;
 SessionEndType pgStatSessionEndCause = DISCONNECT_NORMAL;
 
 /* Record that's written to 2PC state file when pgstat state is persisted */
diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c
index 7229598822..2d7d3b6dce 100644
--- a/src/backend/utils/activity/backend_status.c
+++ b/src/backend/utils/activity/backend_status.c
@@ -577,6 +577,7 @@ pgstat_report_activity(BackendState state, const char *cmd_str)
 	 */
 	if ((beentry->st_state == STATE_RUNNING ||
 		 beentry->st_state == STATE_FASTPATH ||
+		 beentry->st_state == STATE_IDLE ||
 		 beentry->st_state == STATE_IDLEINTRANSACTION ||
 		 beentry->st_state == STATE_IDLEINTRANSACTION_ABORTED) &&
 		state != beentry->st_state)
@@ -590,9 +591,21 @@ pgstat_report_activity(BackendState state, const char *cmd_str)
 
 		if (beentry->st_state == STATE_RUNNING ||
 			beentry->st_state == STATE_FASTPATH)
-			pgstat_count_conn_active_time((PgStat_Counter) secs * 100 + usecs);

Re: Add connection active, idle time to pg_stat_activity

2021-11-09 Thread Rafia Sabih
On Tue, 2 Nov 2021 at 09:00, Dilip Kumar  wrote:
>
> On Tue, Oct 26, 2021 at 5:17 PM Rafia Sabih  wrote:
> >
> > >
> > > To provide this information I was digging into how the statistics
> > > collector is working and found out there is already information like
> > > total time that a connection is active as well as idle computed in
> > > pgstat_report_activity[1]. Ideally, this would be the values we would
> > > like to see per process in pg_stat_activity.
> > >
> > > Curious to know your thoughts on this.
>
> +1 for the idea
>
Thanks!

> > Please find the attached patch for the idea of our intentions.
> > It basically adds three attributes for idle, idle_in_transaction, and
> > active time respectively.
> > Please let me know your views on this and I shall add this to the
> > upcoming commitfest for better tracking.
>
> About the patch, IIUC earlier all the idle time was accumulated in the
> "pgStatTransactionIdleTime" counter, now with your patch you have
> introduced one more counter which specifically tracks the
> STATE_IDLEINTRANSACTION state.  But my concern is that the
> STATE_IDLEINTRANSACTION_ABORTED is still computed under STATE_IDLE and
> that looks odd to me.  Either STATE_IDLEINTRANSACTION_ABORTED should
> be accumulated in the "pgStatTransactionIdleInTxnTime" counter or
> there should be a separate counter for that.  But after your patch we
> can not accumulate this in the "pgStatTransactionIdleTime" counter.
>
As per your comments I have added it in pgStatTransactionIdleInTxnTime.
Please let me know if there are any further comments.

-- 
Regards,
Rafia Sabih
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index eb560955cd..4dfa33ffa9 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -839,7 +839,10 @@ CREATE VIEW pg_stat_activity AS
 s.backend_xmin,
 S.query_id,
 S.query,
-S.backend_type
+S.backend_type,
+S.active_time,
+S.idle_in_transaction_time,
+S.idle_time
 FROM pg_stat_get_activity(NULL) AS S
 LEFT JOIN pg_database AS D ON (S.datid = D.oid)
 LEFT JOIN pg_authid AS U ON (S.usesysid = U.oid);
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index b7d0fbaefd..3e0eb963b3 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -249,6 +249,7 @@ PgStat_Counter pgStatBlockWriteTime = 0;
 static PgStat_Counter pgLastSessionReportTime = 0;
 PgStat_Counter pgStatActiveTime = 0;
 PgStat_Counter pgStatTransactionIdleTime = 0;
+PgStat_Counter pgStatTransactionIdleInTxnTime = 0;
 SessionEndType pgStatSessionEndCause = DISCONNECT_NORMAL;
 
 /* Record that's written to 2PC state file when pgstat state is persisted */
diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c
index 7229598822..805cf7ae1e 100644
--- a/src/backend/utils/activity/backend_status.c
+++ b/src/backend/utils/activity/backend_status.c
@@ -590,9 +590,21 @@ pgstat_report_activity(BackendState state, const char *cmd_str)
 
 		if (beentry->st_state == STATE_RUNNING ||
 			beentry->st_state == STATE_FASTPATH)
-			pgstat_count_conn_active_time((PgStat_Counter) secs * 100 + usecs);
+		{
+			pgstat_count_conn_active_time((PgStat_Counter) secs * 100 + usecs);
+			beentry->st_active_time = pgStatActiveTime;
+		}
+		else if (beentry->st_state == STATE_IDLEINTRANSACTION ||
+				 beentry->st_state == STATE_IDLEINTRANSACTION_ABORTED)
+		{
+			pgstat_count_conn_txn_idle_in_txn_time((PgStat_Counter) secs * 100 + usecs);
+			beentry->st_idle_in_transaction_time = pgStatTransactionIdleInTxnTime;
+		}
 		else
+		{
 			pgstat_count_conn_txn_idle_time((PgStat_Counter) secs * 100 + usecs);
+			beentry->st_idle_time = pgStatTransactionIdleTime;
+		}
 	}
 
 	/*
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index ff5aedc99c..8044318533 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -567,7 +567,7 @@ pg_stat_get_progress_info(PG_FUNCTION_ARGS)
 Datum
 pg_stat_get_activity(PG_FUNCTION_ARGS)
 {
-#define PG_STAT_GET_ACTIVITY_COLS	30
+#define PG_STAT_GET_ACTIVITY_COLS	33
 	int			num_backends = pgstat_fetch_stat_numbackends();
 	int			curr_backend;
 	int			pid = PG_ARGISNULL(0) ? -1 : PG_GETARG_INT32(0);
@@ -916,6 +916,10 @@ pg_stat_get_activity(PG_FUNCTION_ARGS)
 nulls[29] = true;
 			else
 values[29] = UInt64GetDatum(beentry->st_query_id);
+
+			values[30] = Int64GetDatum(beentry->st_active_time);
+			values[31] = Int64GetDatum(beentry->st_idle_in_transaction_time);
+			values[32] = Int64GetDatum(beentry->st_idle_time);

Re: Add connection active, idle time to pg_stat_activity

2021-10-26 Thread Rafia Sabih
On Fri, 22 Oct 2021 at 10:22, Rafia Sabih  wrote:
>
> Hello there hackers,
>
> We at Zalando have faced some issues around long running idle
> transactions and were thinking about increasing the visibility of
> pg_stat_* views to capture them easily. What I found is that currently
> in pg_stat_activity there is a lot of good information about the
> current state of the process, but it is lacking the cumulative
> information on how much time the connection spent being idle, idle in
> transaction or active, we would like to see cumulative values for each
> of these per connection. I believe it would be helpful for us and more
> people out there if we could have total connection active and idle
> time displayed in pg_stat_activity.
>
> To provide this information I was digging into how the statistics
> collector is working and found out there is already information like
> total time that a connection is active as well as idle computed in
> pgstat_report_activity[1]. Ideally, this would be the values we would
> like to see per process in pg_stat_activity.
>
> Curious to know your thoughts on this.
>
> [1]https://github.com/postgres/postgres/blob/cd3f429d9565b2e5caf0980ea7c707e37bc3b317/src/backend/utils/activity/backend_status.c#L593
>
>
>
> --
> Regards,
> Rafia Sabih

Please find the attached patch for the idea of our intentions.
It basically adds three attributes for idle, idle_in_transaction, and
active time respectively.
Please let me know your views on this and I shall add this to the
upcoming commitfest for better tracking.
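
To illustrate the intended usage, with the patch applied (these three
columns do not exist on unpatched servers):

SELECT pid, state, active_time, idle_in_transaction_time, idle_time
FROM pg_stat_activity
WHERE backend_type = 'client backend'
ORDER BY idle_in_transaction_time DESC;
-- sessions spending the most time idle in transaction float to the top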


--
Regards,
Rafia Sabih
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 55f6e3711d..c721dbc0c5 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -835,7 +835,10 @@ CREATE VIEW pg_stat_activity AS
 s.backend_xmin,
 S.query_id,
 S.query,
-S.backend_type
+S.backend_type,
+S.active_time,
+S.idle_in_transaction_time,
+S.idle_time
 FROM pg_stat_get_activity(NULL) AS S
 LEFT JOIN pg_database AS D ON (S.datid = D.oid)
 LEFT JOIN pg_authid AS U ON (S.usesysid = U.oid);
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index b7d0fbaefd..3e0eb963b3 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -249,6 +249,7 @@ PgStat_Counter pgStatBlockWriteTime = 0;
 static PgStat_Counter pgLastSessionReportTime = 0;
 PgStat_Counter pgStatActiveTime = 0;
 PgStat_Counter pgStatTransactionIdleTime = 0;
+PgStat_Counter pgStatTransactionIdleInTxnTime = 0;
 SessionEndType pgStatSessionEndCause = DISCONNECT_NORMAL;
 
 /* Record that's written to 2PC state file when pgstat state is persisted */
diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c
index 7229598822..7ced0f738b 100644
--- a/src/backend/utils/activity/backend_status.c
+++ b/src/backend/utils/activity/backend_status.c
@@ -590,9 +590,20 @@ pgstat_report_activity(BackendState state, const char *cmd_str)
 
 		if (beentry->st_state == STATE_RUNNING ||
 			beentry->st_state == STATE_FASTPATH)
-			pgstat_count_conn_active_time((PgStat_Counter) secs * 100 + usecs);
+		{
+			pgstat_count_conn_active_time((PgStat_Counter) secs * 100 + usecs);
+			beentry->st_active_time = pgStatActiveTime;
+		}
+		else if (beentry->st_state == STATE_IDLEINTRANSACTION)
+		{
+			pgstat_count_conn_txn_idle_in_txn_time((PgStat_Counter) secs * 100 + usecs);
+			beentry->st_idle_in_transaction_time = pgStatTransactionIdleInTxnTime;
+		}
 		else
+		{
 			pgstat_count_conn_txn_idle_time((PgStat_Counter) secs * 100 + usecs);
+			beentry->st_idle_time = pgStatTransactionIdleTime;
+		}
 	}
 
 	/*
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index ff5aedc99c..8044318533 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -567,7 +567,7 @@ pg_stat_get_progress_info(PG_FUNCTION_ARGS)
 Datum
 pg_stat_get_activity(PG_FUNCTION_ARGS)
 {
-#define PG_STAT_GET_ACTIVITY_COLS	30
+#define PG_STAT_GET_ACTIVITY_COLS	33
 	int			num_backends = pgstat_fetch_stat_numbackends();
 	int			curr_backend;
 	int			pid = PG_ARGISNULL(0) ? -1 : PG_GETARG_INT32(0);
@@ -916,6 +916,10 @@ pg_stat_get_activity(PG_FUNCTION_ARGS)
 nulls[29] = true;
 			else
 values[29] = UInt64GetDatum(beentry->st_query_id);
+
+			values[30] = Int64GetDatum(beentry->st_active_time);
+			values[31] = Int64GetDatum(beentry->st_idle_in_transaction_time);
+			values[32] = Int64GetDatum(beentry->st_idle_time);
 		}
 		else
 		{
@@ -944,6 +948,9 @@ pg_stat_get_activity(PG_FUNCTION_ARGS)
 			nulls[27] = true;
 			nulls[28] = true;
 			nulls[29] = true;
+			values[30] = true;
+			values[31] = true;
+			values[32] = true;

Add connection active, idle time to pg_stat_activity

2021-10-22 Thread Rafia Sabih
Hello there hackers,

We at Zalando have faced some issues around long running idle
transactions and were thinking about increasing the visibility of
pg_stat_* views to capture them easily. What I found is that currently
in pg_stat_activity there is a lot of good information about the
current state of the process, but it is lacking the cumulative
information on how much time the connection spent being idle, idle in
transaction or active, we would like to see cumulative values for each
of these per connection. I believe it would be helpful for us and more
people out there if we could have total connection active and idle
time displayed in pg_stat_activity.

To provide this information I was digging into how the statistics
collector is working and found out there is already information like
total time that a connection is active as well as idle computed in
pgstat_report_activity[1]. Ideally, this would be the values we would
like to see per process in pg_stat_activity.

Curious to know your thoughts on this.

[1]https://github.com/postgres/postgres/blob/cd3f429d9565b2e5caf0980ea7c707e37bc3b317/src/backend/utils/activity/backend_status.c#L593



-- 
Regards,
Rafia Sabih




Re: Position of ClientAuthentication hook

2021-06-16 Thread Rafia Sabih
On Mon, 14 Jun 2021 at 21:04, Robert Haas  wrote:
>
> On Mon, Jun 14, 2021 at 8:51 AM Rafia Sabih  wrote:
> > I have a doubt regarding the positioning of clientAuthentication hook
> > in function ClientAuthentication. Particularly, in case of hba errors,
> > i.e. cases uaReject or uaImplicitReject it errors out, leading to no
> > calls to any functions attached to the authentication hook. Can't we
> > process the hook function first and then error out...?
>
> Maybe. One potential problem is that if the hook errors out, the
> original error would be lost and only the error thrown by the hook
> would be logged or visible to the client. Whether or not that's a
> problem depends, I suppose, on what you're trying to do with the hook.

Thanks Robert for this quick clarification.


-- 
Regards,
Rafia Sabih




Position of ClientAuthentication hook

2021-06-14 Thread Rafia Sabih
Hello hackers,

I have a doubt regarding the positioning of clientAuthentication hook
in function ClientAuthentication. Particularly, in case of hba errors,
i.e. cases uaReject or uaImplicitReject it errors out, leading to no
calls to any functions attached to the authentication hook. Can't we
process the hook function first and then error out...?

-- 
Regards,
Rafia Sabih




Re: Parallel copy

2020-07-12 Thread Rafia Sabih
>
> > +static pg_attribute_always_inline void
> > +CopyReadBinaryAttributeLeader(CopyState cstate, FmgrInfo *flinfo,
> > +   Oid typioparam, int32 typmod, uint32 *new_block_pos,
> > +   int m, ParallelCopyTupleInfo *tuple_start_info_ptr,
> > +   ParallelCopyTupleInfo *tuple_end_info_ptr, uint32 *line_size)
> > I felt this function need not be an inline function.
>
> Yes. Changed.
>
> >
> > +   /* binary format */
> > +   /* for paralle copy leader, fill in the error
> > There are some typos, run spell check
>
> Done.
>
> >
> > +   /* raw_buf_index should never cross data block size,
> > +* as the required number of data blocks would have
> > +* been obtained in the above while loop.
> > +*/
> > There are few places, commenting style should be changed to postgres style
>
> Changed.
>
> >
> > +   if (cstate->pcdata->curr_data_block == NULL)
> > +   {
> > +   block_pos = WaitGetFreeCopyBlock(pcshared_info);
> > +
> > +   cstate->pcdata->curr_data_block =
> > +   &pcshared_info->data_blocks[block_pos];
> > +
> > +   cstate->raw_buf_index = 0;
> > +
> > +   readbytes = CopyGetData(cstate,
> > cstate->pcdata->curr_data_block->data, 1, DATA_BLOCK_SIZE);
> > +
> > +   elog(DEBUG1, "LEADER - bytes read from file %d", readbytes);
> > +
> > +   if (cstate->reached_eof)
> > +   return true;
> > +   }
> > There are many empty lines, these are not required.
> >
>
> Removed.
>
> >
> > +
> > +   fld_count = (int16) pg_ntoh16(fld_count);
> > +
> > +   CHECK_FIELD_COUNT;
> > +
> > +   cstate->raw_buf_index = cstate->raw_buf_index + sizeof(fld_count);
> > +   new_block_pos = pcshared_info->cur_block_pos;
> > You can run pg_indent once for the changes.
> >
>
> I ran pg_indent and observed that there are many places getting
> modified by pg_indent. If we need to run pg_indent on copy.c for
> parallel copy alone, then first, we need to run on plain copy.c and
> take those changes and then run for all parallel copy files. I think
> we better run pg_indent, for all the parallel copy patches once and
> for all, maybe just before we kind of finish up all the code reviews.
>
> > +   if (mode == 1)
> > +   {
> > +   cstate->pcdata->curr_data_block = data_block;
> > +   cstate->raw_buf_index = 0;
> > +   }
> > +   else if(mode == 2)
> > +   {
> > Could use macros for 1 & 2 for better readability.
>
> Done.
>
> >
> > +
> > +   if (following_block_id == -1)
> > +   break;
> > +   }
> > +
> > +   if (following_block_id != -1)
> > +
> > pg_atomic_add_fetch_u32(&pcshared_info->data_blocks[following_block_id].unprocessed_line_parts,
> > 1);
> > +
> > +   *line_size = *line_size +
> > tuple_end_info_ptr->offset + 1;
> > +   }
> > We could calculate the size as we parse and identify one record, if we
> > do that way this can be removed.
> >
>
> Done.

Hi Bharath,

I was looking forward to reviewing this patch set, but unfortunately it
shows a reject in copy.c and might need a rebase.
I was applying it on master over commit
cd22d3cdb9bd9963c694c01a8c0232bbae3ddcfb.

-- 
Regards,
Rafia Sabih




Re: Columns correlation and adaptive query optimization

2020-03-25 Thread Rafia Sabih
Hello,

This sounds like an interesting addition to PostgreSQL. I spent some
time reviewing it today; here are a few comments.

On Wed, 25 Mar 2020 at 14:28, Konstantin Knizhnik
 wrote:
>
>
>
> On 25.03.2020 16:00, David Steele wrote:
> > On 3/25/20 6:57 AM, Konstantin Knizhnik wrote:
> >>
> >>
> >> On 24.03.2020 20:12, David Steele wrote:
> >>> On 12/24/19 3:15 AM, Konstantin Knizhnik wrote:
> >>>> New version of patch implicitly adding multicolumn statistic in
> >>>> auto_explain extension and using it in optimizer for more precise
> >>>> estimation of join selectivity.
> >>>> This patch fixes race condition while adding statistics and
> >>>> restricts generated statistic name to fit in 64 bytes (NameData).
> >>>
> >>> This patch no longer applies:
> >>> https://commitfest.postgresql.org/27/2386/
> >>>
> >>> The CF entry has been updated to Waiting on Author.
> >>
> >> Rebased patch is attached.
> >
> > The patch applies now but there are error on Windows and Linux:
> > https://ci.appveyor.com/project/postgresql-cfbot/postgresql/build/1.0.85481
> >
> > https://travis-ci.org/postgresql-cfbot/postgresql/builds/666729979
> >
> > Regards,
>
> Sorry, yet another patch is attached.
>
> --
> Konstantin Knizhnik
> Postgres Professional: http://www.postgrespro.com
> The Russian Postgres Company
>

+static void
+AddMultiColumnStatisticsForNode(PlanState *planstate, ExplainState *es);
+

This doesn't look like the right place for it; you might want to
declare it with the other functions at the start of the file.

Also, there is no description for any of the functions here; it
wouldn't hurt to have some more comments.

A few more questions that cross my mind at this point:

- have you tried measuring the extra cost we have to pay for these
additional statistics, and comparing it with the benefit they give in
terms of accuracy (see the sketch below for one way to do that)?
- I would also be interested in understanding if there are cases where
adding this extra step doesn't help; have you excluded them already,
or are some of them easily identifiable at this stage...?
- is there any limit on the number of columns for which this will
work, or should there be any such limit...?
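
One way to put numbers on the first question is to compare estimates with
and without an equivalent manually created extended statistics object;
this is only the manual counterpart of what the patch automates, not the
patch's own code path:

CREATE TABLE tbl (a int, b int);
INSERT INTO tbl SELECT i % 100, i % 100 FROM generate_series(1, 100000) i;
ANALYZE tbl;
EXPLAIN SELECT * FROM tbl WHERE a = 1 AND b = 1;  -- estimate far too low
CREATE STATISTICS tbl_a_b (dependencies) ON a, b FROM tbl;
ANALYZE tbl;
EXPLAIN SELECT * FROM tbl WHERE a = 1 AND b = 1;  -- close to the real 1000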

-- 
Regards,
Rafia Sabih




Re: adding partitioned tables to publications

2020-01-28 Thread Rafia Sabih
Hi Amit,

Once again I went through this patch set and here are my few comments,

On Thu, 23 Jan 2020 at 11:10, Amit Langote  wrote:
>
> On Wed, Jan 22, 2020 at 2:38 PM Amit Langote  wrote:
> > Other than that, the updated patch contains following significant changes:
> >
> > * Changed pg_publication.c: GetPublicationRelations() so that any
> > published partitioned tables are expanded as needed
> >
> > * Since the pg_publication_tables view is backed by
> > GetPublicationRelations(), that means subscriptioncmds.c:
> > fetch_table_list() no longer needs to craft a query to include
> > partitions when needed, because partitions are included at source.
> > That seems better, because it allows to limit the complexity
> > surrounding publication of partitioned tables to the publication side.
> >
> > * Fixed the publication table DDL to spot more cases of tables being
> > added to a publication in a duplicative manner.  For example,
> > partition being added to a publication which already contains its
> > ancestor and a partitioned tables being added to a publication
> > (implying all of its partitions are added) which already contains a
> > partition
>
> On second thought, this seems like an overkill.  It might be OK after
> all for both a partitioned table and its partitions to be explicitly
> added to a publication without complaining of duplication. IOW, it's
> the user's call whether it makes sense to do that or not.
>
> > Only attaching 0001.
>
> Attached updated 0001 considering the above and the rest of the
> patches that add support for replicating partitioned tables using
> their own identity and schema.  I have reorganized the other patches
> as follows:
>
> 0002: refactoring of logical/worker.c without any functionality
> changes (contains much less churn than in earlier versions)
>
> 0003: support logical replication into partitioned tables on the
> subscription side (allows replicating from a non-partitioned table on
> publisher node into a partitioned table on subscriber node)
>
> 0004: support optionally replicating partitioned table changes (and
> changes directly made to partitions) using root partitioned table
> identity and schema

+ cannot replicate from a regular table into a partitioned able or vice
A 't' is missing from 'table' here.

+ 
+  When a partitioned table is added to a publication, all of its existing
+  and future partitions are also implicitly considered to be part of the
+  publication.  So, even operations that are performed directly on a
+  partition are also published via its ancestors' publications.

Now this is confusing: does it mean that partitions added to the table
later will be replicated too? I think not, because you first need to
create them manually on the subscriber side, don't you...?
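
To make the two readings concrete, a sketch of how I understand the
patch's intent (not tested; table names are made up):

CREATE TABLE q (a int) PARTITION BY RANGE (a);
CREATE PUBLICATION qpub FOR TABLE q;
-- a partition created only after the publication exists:
CREATE TABLE q1 PARTITION OF q FOR VALUES FROM (1) TO (100);
-- under the patch, q1 is implicitly published via q with no further
-- ALTER PUBLICATION, but the subscriber still has to create a matching
-- q1 manually before its changes can be applied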

+ /* Must be a regular or partitioned table */
+ if (RelationGetForm(targetrel)->relkind != RELKIND_RELATION &&
+ RelationGetForm(targetrel)->relkind != RELKIND_PARTITIONED_TABLE)
  ereport(ERROR,
  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
  errmsg("\"%s\" is not a table",

IMHO the error message and details should be modified here to
something along the lines of 'is neither a regular nor a partitioned
table'.

+ * published via an ancestor and when a partitioned tables's partitions
tables's --> tables'

+ if (get_rel_relispartition(relid))
+ {
+ List*ancestors = get_partition_ancestors(relid);

Now, this is just for my understanding: why do the ancestors have to be
a list? I always assumed that a partition could only have one ancestor
-- the root table. Is there something more to it that I am totally
missing here, or is it to cover the scenario of having partitions of
partitions?

Here I also want to clarify one thing: if a partitioned table is dropped
from a publication, are all its partitions implicitly dropped as well?
As far as my understanding goes that doesn't happen, so shouldn't there
be some notice about it?
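
Something along these lines should show it on the publisher (a sketch,
relying on the patch expanding partitions at source in
GetPublicationRelations):

ALTER PUBLICATION mypub DROP TABLE t;
SELECT tablename FROM pg_publication_tables WHERE pubname = 'mypub';
-- if partitions of t still show up here, they were not implicitly dropped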

-GetPublicationRelations(Oid pubid)
+GetPublicationRelations(Oid pubid, bool include_partitions)

How about having an enum here with INCLUDE_PARTITIONS,
INCLUDE_PARTITIONED_REL, and SKIP_PARTITIONS to address the three
possibilities and avoid reiterating through the list in
pg_get_publication_tables()?

-- 
Regards,
Rafia Sabih




Re: [Logical Replication] TRAP: FailedAssertion("rel->rd_rel->relreplident == REPLICA_IDENTITY_DEFAULT || rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL || rel->rd_rel->relreplident == REPLICA_IDE

2020-01-09 Thread Rafia Sabih
On Thu, 9 Jan 2020 at 08:48, Dilip Kumar  wrote:
>
> On Thu, Jan 9, 2020 at 12:50 PM Neha Sharma
>  wrote:
> >
> > Hi Michael,
> > Thanks for looking into the issue. Sorry by mistake I had mentioned the 
> > incorrect DML query,please use the query as mentioned below.
> >
> > On Thu, Jan 9, 2020 at 11:38 AM Michael Paquier  wrote:
> >>
> >> On Tue, Jan 07, 2020 at 05:38:49PM +0530, Neha Sharma wrote:
> >> > I am getting a server crash on publication server on HEAD for the below
> >> > test case.
> >> >
> >> > Test case:
> >> > Publication server:
> >> > create table test(a int);
> >> > create publication test_pub for all tables;
> >> > alter table test replica identity NOTHING ;
> >> >
> >> > Subscription server:
> >> > create table test(a int);
> >> > create subscription test_sub CONNECTION 'host=172.16.208.32 port=5432
> >> > dbname=postgres user=centos' PUBLICATION test_pub WITH ( slot_name =
> >> > test_slot_sub);
> >> >
> >> > Publication server:
> >> > insert into test values(generate_series(1,5),'aa');
> >
> >  insert into test values(generate_series(1,5));
> >>
> >>
> >> This would not work as your relation has only one column.  There are
> >> some TAP tests for logical replication (none actually stressing
> >> NOTHING as replica identity), which do not fail, and I cannot
> >> reproduce the failure myself.
>
> I am able to reproduce the failure,  I think the assert in the
> 'logicalrep_write_insert' is not correct.  IMHO even if the replica
> identity is set to NOTHING we should be able to replicate INSERT?
>
True that.
> This will fix the issue.
>
> diff --git a/src/backend/replication/logical/proto.c
> b/src/backend/replication/logical/proto.c
> index dcf7c08..471461c 100644
> --- a/src/backend/replication/logical/proto.c
> +++ b/src/backend/replication/logical/proto.c
> @@ -145,7 +145,8 @@ logicalrep_write_insert(StringInfo out, Relation
> rel, HeapTuple newtuple)
>
> Assert(rel->rd_rel->relreplident == REPLICA_IDENTITY_DEFAULT ||
>rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL ||
> -  rel->rd_rel->relreplident == REPLICA_IDENTITY_INDEX);
> +  rel->rd_rel->relreplident == REPLICA_IDENTITY_INDEX ||
> +  rel->rd_rel->relreplident == REPLICA_IDENTITY_NOTHING);
>
> /* use Oid as relation identifier */
> pq_sendint32(out, RelationGetRelid(rel));
>
+1



-- 
Regards,
Rafia Sabih




Re: adding partitioned tables to publications

2020-01-07 Thread Rafia Sabih
On Tue, 7 Jan 2020 at 06:02, Amit Langote  wrote:

> Rebased and updated to address your comments.
>
+  
+   Partitioned tables are not considered when FOR ALL TABLES
+   is specified.
+  
+
What is the reason for the above? I mean not the comment itself, but
why partitioned tables are not included in the FOR ALL TABLES option.
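
For instance, assuming the patch behaves as the quoted documentation line
says:

CREATE PUBLICATION allpub FOR ALL TABLES;
SELECT tablename FROM pg_publication_tables WHERE pubname = 'allpub';
-- leaf partitions are listed, since they are regular tables; only the
-- partitioned parents themselves are skipped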


-- 
Regards,
Rafia Sabih




Re: adding partitioned tables to publications

2020-01-06 Thread Rafia Sabih
Hi Amit,

I went through this patch set once again today and here are my two cents.

On Mon, 16 Dec 2019 at 10:19, Amit Langote  wrote:
>
> Attached updated patches.
- differently partitioned setup.  Attempts to replicate tables other than
- base tables will result in an error.
+ Replication is only supported by regular and partitioned tables, although
+ the table kind must match between the two servers, that is, one cannot

I find the phrase 'table kind' a bit odd; how about something like
'type of the table'?

/* Only plain tables can be aded to publications. */
- if (tbinfo->relkind != RELKIND_RELATION)
+ /* Only plain and partitioned tables can be added to publications. */
IMHO using regular instead of plain would be more consistent.

+ /*
+ * Find a partition for the tuple contained in remoteslot.
+ *
+ * For insert, remoteslot is tuple to insert.  For update and delete, it
+ * is the tuple to be replaced and deleted, repectively.
+ */
Misspelled 'respectively'.

+static void
+apply_handle_tuple_routing(ResultRelInfo *relinfo,
+LogicalRepRelMapEntry *relmapentry,
+EState *estate, CmdType operation,
+TupleTableSlot *remoteslot,
+LogicalRepTupleData *newtup)
+{
+ Relation rel = relinfo->ri_RelationDesc;
+ ModifyTableState *mtstate = NULL;
+ PartitionTupleRouting *proute = NULL;
+ ResultRelInfo *partrelinfo,
+*partrelinfo1;

IMHO, partrelinfo1 can be better named to improve readability.

Otherwise, as Dilip already mentioned, a rebase is required,
particularly for 0003 and 0004.

-- 
Regards,
Rafia Sabih




Re: Minor comment fixes for instrumentation.h

2019-12-05 Thread Rafia Sabih
On Thu, 5 Dec 2019 at 14:45, Robert Haas  wrote:
>
> On Tue, Dec 3, 2019 at 8:36 AM Rafia Sabih  wrote:
> > While going through this file I noticed some inconsistencies in the
> > comments. Please find attachment for the fix.
>
> Committed. I think only the duplicated word is a clear error, but the
> other changes seem like mild improvements, so pushed the whole thing.

True and thank you.

-- 
Regards,
Rafia Sabih




Minor comment fixes for instrumentation.h

2019-12-03 Thread Rafia Sabih
Hello,

While going through this file I noticed some inconsistencies in the
comments. Please find attachment for the fix.

-- 
Regards,
Rafia Sabih
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index 70d8632305..13aaf05453 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -48,20 +48,20 @@ typedef struct Instrumentation
 	bool		need_bufusage;	/* true if we need buffer usage data */
 	/* Info about current plan cycle: */
 	bool		running;		/* true if we've completed first tuple */
-	instr_time	starttime;		/* Start time of current iteration of node */
-	instr_time	counter;		/* Accumulated runtime for this node */
-	double		firsttuple;		/* Time for first tuple of this cycle */
-	double		tuplecount;		/* Tuples emitted so far this cycle */
-	BufferUsage bufusage_start; /* Buffer usage at start */
+	instr_time	starttime;		/* start time of current iteration of node */
+	instr_time	counter;		/* accumulated runtime for this node */
+	double		firsttuple;		/* time for first tuple of this cycle */
+	double		tuplecount;		/* # of tuples emitted so far this cycle */
+	BufferUsage bufusage_start; /* buffer usage at start */
 	/* Accumulated statistics across all completed cycles: */
-	double		startup;		/* Total startup time (in seconds) */
-	double		total;			/* Total total time (in seconds) */
-	double		ntuples;		/* Total tuples produced */
-	double		ntuples2;		/* Secondary node-specific tuple counter */
+	double		startup;		/* total startup time (in seconds) */
+	double		total;			/* total time (in seconds) */
+	double		ntuples;		/* total tuples produced */
+	double		ntuples2;		/* secondary node-specific tuple counter */
 	double		nloops;			/* # of run cycles for this node */
-	double		nfiltered1;		/* # tuples removed by scanqual or joinqual */
-	double		nfiltered2;		/* # tuples removed by "other" quals */
-	BufferUsage bufusage;		/* Total buffer usage */
+	double		nfiltered1;		/* # of tuples removed by scanqual or joinqual */
+	double		nfiltered2;		/* # of tuples removed by "other" quals */
+	BufferUsage bufusage;		/* total buffer usage */
 } Instrumentation;
 
 typedef struct WorkerInstrumentation


Re: How to prohibit parallel scan through tableam?

2019-11-28 Thread Rafia Sabih
On Wed, 27 Nov 2019 at 12:33, Konstantin Knizhnik 
wrote:

> Hi hackers,
>
> I wonder how it is possible to prohibit parallel scan for the external
> storage accessed through tableam?
> For example if I want to implement specialized tableam for fast access
> to temp tables, how can I inform optimizer that
> parallel scan is not possible (because table data is local to the backend)?
>
One moment: aren't parallel scans already restricted for temp
tables, or have I misunderstood something here...?
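
That restriction is easy to confirm from SQL:

CREATE TEMP TABLE tt AS SELECT g AS a FROM generate_series(1, 1000000) g;
SET parallel_setup_cost = 0;
SET parallel_tuple_cost = 0;
EXPLAIN SELECT count(*) FROM tt;
-- no Gather node appears even with parallel costs zeroed, because temp
-- table buffers live in backend-local memory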



-- 
Regards,
Rafia Sabih


Re: How to prohibit parallel scan through tableam?

2019-11-27 Thread Rafia Sabih
On Wed, 27 Nov 2019 at 12:33, Konstantin Knizhnik 
wrote:

> Hi hackers,
>
> I wonder how it is possible to prohibit parallel scan for the external
> storage accessed through tableam?
> For example if I want to implement specialized tableam for fast access
> to temp tables, how can I inform optimizer that
> parallel scan is not possible (because table data is local to the backend)?
>
How about setting parallel_setup_cost to disable_cost in costsize.c for
your specific scan method?
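
The GUC-level equivalent, just to illustrate the cost-based route (the
table name is made up; doing this per scan method inside costsize.c would
need a per-relation knob rather than the global setting):

SET parallel_setup_cost = 1000000000;
EXPLAIN SELECT count(*) FROM my_tableam_table;
-- any Gather now looks prohibitively expensive and is not chosen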

-- 
Regards,
Rafia Sabih


Re: Performance improvement for queries with IN clause

2019-11-11 Thread Rafia Sabih
On Sat, 9 Nov 2019 at 12:52, Andreas Karlsson  wrote:

> On 11/8/19 2:52 PM, Rafia Sabih wrote:
> > Now, my question is shouldn't we always use this list in sorted order,
> > in other words can there be scenarios where such a sorting will not
> > help? I am talking about only the cases where the list consists of all
> > constants and could fit in memory. Basically, when we are
> > transforming the in expression and found that it consists of all
> > constants, then sort it as well, codewise at transfromAExprIn, of course
> > there might be better ways to accomplish this.
> >
> > So, your thoughts, opinions, suggestions are more than welcome.
>
> If it is worth sorting them should depend on the index, e.g. for hash
> indexes sorting would just be a waste of time.


Hi Andreas,
Thanks for your response. Here, I meant this list sorting only for btree,
as you rightly pointed out that for other indexes like hash this wouldn't
really make sense.


-- 
Regards,
Rafia Sabih


Performance improvement for queries with IN clause

2019-11-08 Thread Rafia Sabih
Hello all,

I would like to direct your attention to queries of the following type:

select <target list>
from <table>
where <indexed column> IN (<list of constants>)

The plan for such a query uses an index scan (or index-only scan). Now, in
our experiments, if the provided list is sorted then query performance
improves by ~10%. Which makes sense: once we have found the required btree
leaf we just keep moving in one direction, which should be less
time-consuming than searching the tree from the root again.
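
A self-contained way to reproduce what we measured (numbers will of course
vary; the lists below are just an example):

CREATE TABLE t (a int PRIMARY KEY, b text);
INSERT INTO t SELECT i, md5(i::text) FROM generate_series(1, 1000000) i;
-- unsorted list
EXPLAIN ANALYZE SELECT * FROM t WHERE a IN (503, 7, 999983, 42, 120000);
-- the same list sorted
EXPLAIN ANALYZE SELECT * FROM t WHERE a IN (7, 42, 503, 120000, 999983);
-- with a much longer list the timing difference becomes easier to see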

Now, my question is: shouldn't we always use this list in sorted order? In
other words, can there be scenarios where such sorting will not help? I am
talking only about the cases where the list consists of all constants and
could fit in memory. Basically, when we are transforming the IN expression
and find that it consists of all constants, we could sort it as well;
codewise this would be at transformAExprIn, though of course there might
be better ways to accomplish this.

So, your thoughts, opinions, suggestions are more than welcome.

-- 
Regards,
Rafia Sabih


Re: Parallel leader process info in EXPLAIN

2019-11-07 Thread Rafia Sabih
On Mon, 4 Nov 2019 at 00:30, Thomas Munro  wrote:

> On Mon, Nov 4, 2019 at 12:11 PM Thomas Munro 
> wrote:
> > I guess I thought of that as a debugging feature and took it out
> > because it was too verbose, but maybe it just needs to be controlled
> > by the VERBOSE switch.  Do you think we should put that back?
>
> By which I mean: would you like to send a patch?  :-)
>
> Here is a new version of the "Leader:" patch, because cfbot told me
> that gcc didn't like it as much as clang.
>

I was reviewing this patch and here are a few comments,

+static void
+ExplainNodePerProcess(ExplainState *es, bool *opened_group,
+  int worker_number, Instrumentation *instrument)
+{

A small description of this routine would be helpful and would give the
file a consistent look.

Also, I noticed that the worker details are displayed for the sort node
even without VERBOSE, but for scans they only appear with VERBOSE. Am I
missing something, or is there a reason behind this? I am not sure whether
this behavior is introduced by this patch set, though.
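
For example, what I see on a parallel sort without VERBOSE (plan
abbreviated; the table name is made up):

EXPLAIN (ANALYZE, COSTS OFF) SELECT * FROM big ORDER BY a;
--  Gather Merge
--    Workers Launched: 2
--    ->  Sort
--          Sort Method: external merge  Disk: ...
--          Worker 0:  Sort Method: external merge  Disk: ...
--          Worker 1:  Sort Method: external merge  Disk: ...

whereas per-worker timing and row counts on scan nodes only appear with
VERBOSE.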

-- 
Regards,
Rafia Sabih


Re: adding partitioned tables to publications

2019-11-04 Thread Rafia Sabih
Hi Amit,

On Fri, 11 Oct 2019 at 08:06, Amit Langote  wrote:

>
> Thanks for sharing this case.  I hadn't considered it, but you're
> right that it should be handled sensibly.  I have fixed table sync
> code to handle this case properly.  Could you please check your case
> with the attached updated patch?
>
I was checking this today and found that the behavior doesn't change much
with the updated patch. The tables are still replicated; it's just that a
select count from the parent table shows 0, while the rest of the
partitions, including the default one, have the data from the publisher. I
was expecting something more like an error at the subscriber saying the
table types do not match.

Please find the attached file for the test case, in case something is
unclear.

-- 
Regards,
Rafia Sabih
-- publisher
create table t (i int, j int, k text) partition by range(i);
create table child1 partition of t for values from ( 1) to (100);
create table child2 partition of t for values from (100) to (200);
create table child3 partition of t for values from (200) to (300);
create table child4 partition of t for values from (300) to (400);
create table child5 partition of t for values from (400) to (500);
create table def partition of t DEFAULT;

create publication mypub;
alter publication mypub add table t;

insert into t values (generate_series(1,500), generate_series(1,500), 
repeat('jqbsuyt7832edjw', 20));

--subscriber
create table t (i int, j int, k text);
create table child1 (i int, j int, k text);
create table child5 (i int, j int, k text);
create table child4 (i int, j int, k text);
create table child3 (i int, j int, k text);
create table child2 (i int, j int, k text);
create table def (i int, j int, k text);

create subscription mysub connection 'host=localhost port=5432 dbname=postgres' 
publication mypub;



Re: adding partitioned tables to publications

2019-10-10 Thread Rafia Sabih
On Thu, 10 Oct 2019 at 08:29, Amit Langote  wrote:

> On Mon, Oct 7, 2019 at 9:55 AM Amit Langote 
> wrote:
> > One cannot currently add partitioned tables to a publication.
> >
> > create table p (a int, b int) partition by hash (a);
> > create table p1 partition of p for values with (modulus 3, remainder 0);
> > create table p2 partition of p for values with (modulus 3, remainder 1);
> > create table p3 partition of p for values with (modulus 3, remainder 2);
> >
> > create publication publish_p for table p;
> > ERROR:  "p" is a partitioned table
> > DETAIL:  Adding partitioned tables to publications is not supported.
> > HINT:  You can add the table partitions individually.
> >
> > One can do this instead:
> >
> > create publication publish_p1 for table p1;
> > create publication publish_p2 for table p2;
> > create publication publish_p3 for table p3;
> >
> > but maybe that's too much code to maintain for users.
> >
> > I propose that we make this command:
> >
> > create publication publish_p for table p;
> >
> > automatically add all the partitions to the publication.  Also, any
> > future partitions should also be automatically added to the
> > publication.  So, publishing a partitioned table automatically
> > publishes all of its existing and future partitions.  Attached patch
> > implements that.
> >
> > What doesn't change with this patch is that the partitions on the
> > subscription side still have to match one-to-one with the partitions
> > on the publication side, because the changes are still replicated as
> > being made to the individual partitions, not as the changes to the
> > root partitioned table.  It might be useful to implement that
> > functionality on the publication side, because it allows users to
> > define the replication target any way they need to, but this patch
> > doesn't implement that.
>
> Added this to the next CF: https://commitfest.postgresql.org/25/2301/
>
Hi Amit,

Lately I was exploring the logical replication feature of PostgreSQL, and
I found this addition to its scope for partitioned tables a useful one.

In order to understand the working of your patch a bit more, I performed an
experiment wherein I created a partitioned table with several children and
a default partition on the publisher side, and normal tables with the same
names as the parent, the children, and the default partition on the
subscriber side. Next I established the logical replication connection,
and to my surprise the data was successfully replicated from the
partitioned table to the normal tables, and then this error filled the
logs:
LOG:  logical replication table synchronization worker for subscription
"my_subscription", table "parent" has started
ERROR:  table "public.parent" not found on publisher

Here "parent" is the name of the partitioned table on the publisher side,
and it is present as a normal table on the subscriber side as well. Which
is understandable: the sync worker is trying to find a normal table of the
same name but couldn't find one. Maybe it should not worry about that at
sync time, if it did not at replication time.

Please let me know if this is expected, because in my opinion it is not
desirable; there should be some check on the table type for replication.
This wasn't important until now, maybe because only normal tables were
replicated, but with the extension of the scope of logical replication to
more objects such checks would be helpful.

On a separate note, I was thinking that for partitioned tables, wouldn't
it be cleaner to have something like this: you create only the partitioned
table at the subscriber, and then when logical replication starts it
creates the child tables accordingly? Or would that be too much for the
future...?

-- 
Regards,
Rafia Sabih


Re: pgbench - allow to create partitioned tables

2019-10-01 Thread Rafia Sabih
On Tue, 1 Oct 2019 at 16:48, Fabien COELHO  wrote:

>
> >> Yeah, I know that, but this doesn't look quite right.  I mean to say
> >> whatever we want to say via this message is correct, but I am not
> >> completely happy with the display part.  How about something like:
> >> "pgbench_accounts is missing, you need to do initialization (\"pgbench
> >> -i\") in database \"%s\"\n"?  Feel free to propose something else on
> >> similar lines?  If possible, I want to convey this information in a
> >> single
> >> sentence.
> >>
> >> How about, "pgbench_accounts is missing, initialize (\"pgbench -i\") in
> >> database \"%s\"\n"?
>
> I think that we should not presume too much about the solution: perhaps
> the user did not specify the right database or host and it has nothing to
> do with initialization.
>
> Maybe something like:
>
> "pgbench_accounts is missing, perhaps you need to initialize (\"pgbench
> -i\") in database \"%s\"\n"
>
> The two sentences approach has the logic of "error" and a separate "hint"
> which is often used.
>

+1 for error and hint separation.


-- 
Regards,
Rafia Sabih


Re: pgbench - allow to create partitioned tables

2019-10-01 Thread Rafia Sabih
On Tue, 1 Oct 2019 at 15:39, Amit Kapila  wrote:

> On Tue, Oct 1, 2019 at 11:51 AM Fabien COELHO  wrote:
>
>>
>> Hello Amit,
>>
>> > 1. ran pgindent
>> > 2. As per Alvaro's suggestions move few function definitions.
>> > 3. Changed one or two comments and fixed spelling at one place.
>>
>> Thanks for the improvements.
>>
>> Not sure why you put "XXX - " in front of "append_fillfactor" comment,
>> though.
>>
>>
> It is to indicate that we can do this after further consideration.
>
>
>> > + fprintf(stderr,
>> > + "no pgbench_accounts table found in search_path\n"
>> > + "Perhaps you need to do initialization (\"pgbench -i\") in database
>> > \"%s\"\n", PQdb(con));
>>
>> > Can anyone else think of a better error message either in wording or
>> > style for above case?
>>
>> No better idea from me. The second part is a duplicate from a earlier
>> comment, when getting the scale fails.
>>
>
> Yeah, I know that, but this doesn't look quite right.  I mean to say
> whatever we want to say via this message is correct, but I am not
> completely happy with the display part.  How about something like:
> "pgbench_accounts is missing, you need to do initialization (\"pgbench
> -i\") in database \"%s\"\n"?  Feel free to propose something else on
> similar lines?  If possible, I want to convey this information in a single
> sentence.
>
How about, "pgbench_accounts is missing, initialize (\"pgbench -i\") in
database \"%s\"\n"?

-- 
Regards,
Rafia Sabih


Re: [PATCH] Incremental sort (was: PoC: Partial sort)

2019-09-04 Thread Rafia Sabih
On Tue, 30 Jul 2019 at 02:17, Tomas Vondra 
wrote:

> On Sun, Jul 21, 2019 at 01:34:22PM +0200, Tomas Vondra wrote:
> >
> > ...
> >
> >I wonder if we're approaching this wrong. Maybe we should not reverse
> >engineer queries for the various places, but just start with a set of
> >queries that we want to optimize, and then identify which places in the
> >planner need to be modified.
> >
>
> I've decided to do a couple of experiments, trying to make my mind about
> which modified places matter to different queries. But instead of trying
> to reverse engineer the queries, I've taken a different approach - I've
> compiled a list of queries that I think are sensible and relevant, and
> then planned them with incremental sort enabled in different places.
>
> I don't have any clear conclusions at this point - it does show some of
> the places don't change plan for any of the queries, although there may
> be some additional query where it'd make a difference.
>
> But I'm posting this mostly because it might be useful. I've initially
> planned to move changes that add incremental sort paths to separate
> patches, and then apply/skip different subsets of those patches. But
> then I realized there's a better way to do this - I've added a bunch of
> GUCs, one for each such place. This allows doing this testing without
> having to rebuild repeatedly.
>
> I'm not going to post the patch(es) with extra GUCs here, because it'd
> just confuse the patch tester, but it's available here:
>
>   https://github.com/tvondra/postgres/tree/incremental-sort-20190730
>
> There are 10 GUCs, one for each place in planner where incremental sort
> paths are constructed. By default all those are set to 'false' so no
> incremental sort paths are built. If you do
>
>   SET devel_create_ordered_paths = on;
>
> it'll start creating the paths in non-parallel in create_ordered_paths.
> Then you may enable devel_create_ordered_paths_parallel to also consider
> parallel paths, etc.
>
> The list of queries (synthetic, but hopefully sufficiently realistic)
> and a couple of scripts to collect the plans is in this repository:
>
>   https://github.com/tvondra/incremental-sort-tests-2
>
> There's also a spreadsheet with a summary of results, with a visual
> representation of which GUCs affect which queries.
>
Wow, that sounds like an elaborate experiment. But where is this
spreadsheet you mentioned?

-- 
Regards,
Rafia Sabih


Re: Creating partitions automatically at least on HASH?

2019-08-27 Thread Rafia Sabih
On Mon, 26 Aug 2019 at 19:46, Fabien COELHO  wrote:

>
> Hello Rafia,
>
> >>CREATE TABLE Stuff (...)
> >>  PARTITION BY [HASH | RANGE | LIST] (…)
> >>DO NONE -- this is the default
> >>DO [IMMEDIATE|DEFERRED] USING (…)
> >>
> >> Where the USING part would be generic keyword value pairs, eg:
> >>
> >> For HASH: (MODULUS 8) and/or (NPARTS 10)
> >>
> >> For RANGE: (START '1970-01-01', STOP '2020-01-01', INCREMENT '1 year')
> >>  and/or (START 1970, STOP 2020, NPARTS 50)
> >>
> >> And possibly for LIST: (IN (…), IN (…), …), or possibly some other
> >> keyword.
> >>
> >> The "DEFERRED" could be used as an open syntax for dynamic partitioning,
> >> if later someone would feel like doing it.
> >>
> >> ISTM that "USING" is better than "WITH" because WITH is already used
> >> specifically for HASH and other optional stuff in CREATE TABLE.
> >>
> >> The text constant would be interpreted depending on the partitioning
> >> expression/column type.
> >>
> >> Any opinion about the overall approach?
>
> > I happened to start a similar discussion [1], being unaware of this one,
> > and there Ashutosh Sharma talked about interval partitioning in Oracle.
> > Looking closely, it looks like we can make this automatic partitioning
> > more convenient by having something similar. Basically, it is creating
> > partitions on demand, or lazy partitioning.
>
> Yep, the "what" of dynamic partitioning is more or less straightforward,
> along the line you are describing.
>
> For me there are really two questions:
>
>   - having an extendable syntax, hence the mail I sent, which would cover
> both automatic static & dynamic partitioning and their parameters,
> given that we already have manual static, automatic static should
> be pretty easy.
>
>   - implementing the stuff, with limited performance impact if possible
> for the dynamic case, which is non-trivial.
>
> > To explain a bit more, let's take range partitioning for example: first
> > the parent table is created, with its interval and start and end values
> > specified, and it creates only the parent table, just like it works today.
>
> > Now, if there comes an insertion that does not belong to the existing (or
> > any, in the case of the first insertion) partition(s), then the
> > corresponding partition is created,
>
> Yep. Now, you also have to deal with race condition issues, i.e. two
> parallel sessions inserting tuples that must create the same partition, and
> probably you would like to avoid a deadlock.
>
Hmmm, that shouldn't be very hard. Postgres handles many such things,
mostly by a mutex-guarded shared memory structure, I think. E.g. we can
have a shared memory structure associated with the parent table holding
the information on all the available partitions, and keep this structure
guarded by a mutex. Any time a new partition has to be created, the
relevant information is first entered in this structure before actually
creating it.
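A minimal sketch of what I have in mind, with purely hypothetical names
(nothing here is from an actual patch):

/* Hypothetical shared bookkeeping for on-demand partition creation. */
typedef struct AutoPartitionState
{
    LWLock     *lock;           /* guards the fields below */
    Oid         parentrelid;    /* the partitioned (parent) table */
    int         npartitions;    /* partitions known to exist */
    /* ... bounds of the existing partitions ... */
} AutoPartitionState;

/* Re-check under the lock, so that of two sessions racing to create the
 * same partition only one actually creates it; the other just reuses it. */
LWLockAcquire(state->lock, LW_EXCLUSIVE);
if (!partition_exists(state, new_bound))        /* hypothetical helper */
    create_new_partition(state, new_bound);     /* hypothetical helper */
LWLockRelease(state->lock);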

> > I think it is extensible to other partitioning schemes as well. Also it
> > is likely to have a positive impact on the queries, because there will
> > be required partitions only and would not require to educate
> > planner/executor about many empty partitions.
>
> Yep, but it creates other problems to solve…
>
Isn't it always the case? :)

-- 
Regards,
Rafia Sabih


Re: Creating partitions automatically at least on HASH?

2019-08-26 Thread Rafia Sabih
On Sun, 18 Aug 2019 at 11:33, Fabien COELHO  wrote:

>
> Hello Robert & Robert,
>
> > - no partitions are created immediately (current case)
> >   but will have to be created manually later
> >
> > - static partitions are created automatically, based on provided
> >   parameters
> >
> > - dynamic partitions will be created later, when needed, based
> >   on provided parameters again.
> >
> > Even if all that is not implemented immediately.
> >
> >> We need something that will let you specify just a modulus for hash
> >> partitions, a start, end, and interval for range partitions, and a list
> of
> >> bounds for list partitions.  If we're willing to create a new keyword,
> we
> >> could make PARTITIONS a keyword. Then:
> >>
> >> PARTITION BY HASH (whatever) PARTITIONS 8
> >
> > I think that it should reuse already existing keywords, i.e. MODULUS
> should
> > appear somewhere.
> >
> > Maybe:
> >
> >  ... PARTITION BY HASH (whatever)
> >  [ CREATE [IMMEDIATE | DEFERRED] PARTITIONS (MODULUS 8) |
> >NOCREATE or maybe NO CREATE ];
>
> I have given a small go at the parser part of that.
>
> There are 3 types of partitions with 3 dedicated syntax structures to
> handle their associated parameters (WITH …, FROM … TO …, IN …). ISTM that
> it is a "looks good from far away" idea, but when trying to extend that it
> is starting to be a pain. If a 4th partition type is added, should it be
> yet another syntax? So I'm looking for a generic and extensible syntax
> that could accommodate all cases for automatic creation of partitions.
>
> Second problem: adding a "CREATE" after "PARTITION BY … (…)" creates
> shift-reduce conflicts with potential other CREATE TABLE option
> specification syntax. Not sure which one, but anyway. So the current
> generic syntax I'm considering is using "DO" as a trigger to start the
> optional automatic partition creation stuff:
>
>CREATE TABLE Stuff (...)
>  PARTITION BY [HASH | RANGE | LIST] (…)
>DO NONE -- this is the default
>DO [IMMEDIATE|DEFERRED] USING (…)
>
> Where the USING part would be generic keyword value pairs, eg:
>
> For HASH: (MODULUS 8) and/or (NPARTS 10)
>
> For RANGE: (START '1970-01-01', STOP '2020-01-01', INCREMENT '1 year')
>  and/or (START 1970, STOP 2020, NPARTS 50)
>
> And possibly for LIST: (IN (…), IN (…), …), or possibly some other
> keyword.
>
> The "DEFERRED" could be used as an open syntax for dynamic partitioning,
> if later someone would feel like doing it.
>
> ISTM that "USING" is better than "WITH" because WITH is already used
> specifically for HASH and other optional stuff in CREATE TABLE.
>
> The text constant would be interpreted depending on the partitioning
> expression/column type.
>
> Any opinion about the overall approach?
>
>
I happened to start a similar discussion [1], being unaware of this one,
and there Ashutosh Sharma talked about interval partitioning in Oracle.
Looking closely, it looks like we can make this automatic partitioning
more convenient by having something similar. Basically, it is creating
partitions on demand, or lazy partitioning. To explain a bit more, let's
take range partitioning for example: first the parent table is created,
with its interval and start and end values specified, and it creates only
the parent table, just like it works today. Now, if there comes an
insertion that does not belong to the existing (or any, in the case of
the first insertion) partition(s), then the corresponding partition is
created. I think this is extensible to other partitioning schemes as
well. It is also likely to have a positive impact on queries, because
there will be only the required partitions, and we would not need to
educate the planner/executor about many empty partitions.
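For illustration, combining this lazy behaviour with the USING syntax
proposed above could look like the following; everything here is
hypothetical, none of it is implemented:

CREATE TABLE measurement (ts timestamptz, value float8)
  PARTITION BY RANGE (ts)
  DO DEFERRED USING (START '1970-01-01', STOP '2020-01-01', INCREMENT '1 year');

-- With DEFERRED, an insert falling outside all existing partitions would
-- create the matching one-year partition on demand instead of erroring out.
INSERT INTO measurement VALUES ('1999-06-01', 42.0);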

[1]
https://www.postgresql.org/message-id/flat/20190820205005.GA25823%40alvherre.pgsql#c67245b98e2cfc9c3bd261f134d05368

-- 
Regards,
Rafia Sabih


Improve default partition

2019-08-20 Thread Rafia Sabih
Hello all,

I was just playing around with table partitioning and noticed
1. When one inserts into a parent table with no partitions defined
yet, it errors out
2. Similarly, if we try to insert into a parent table a value that is
not covered by any partition, when no default partition is defined, it
errors out, as the example below shows
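
For example, both cases currently end in the same error:

create table p (i int) partition by range (i);
insert into p values (1);   -- case 1: no partitions at all
-- ERROR:  no partition of relation "p" found for row

create table p1 partition of p for values from (1) to (10);
insert into p values (42);  -- case 2: value not covered, no default partition
-- ERROR:  no partition of relation "p" found for row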

This does not sound very convenient. I was thinking of some mechanism
for such insertions that automatically creates a default partition and
emits a notice so the user knows the row is going to the default
partition. Basically, always having a default partition. After all, a
default is something that is there by default, isn't it?

I will be happy to know your thoughts on this.

-- 
Regards,
Rafia Sabih




Re: Resume vacuum and autovacuum from interruption and cancellation

2019-08-08 Thread Rafia Sabih
On Tue, 16 Jul 2019 at 13:57, Masahiko Sawada  wrote:
>
> On Wed, Jun 12, 2019 at 1:30 PM Masahiko Sawada  wrote:
> >
> > Hi all,
> >
> > Long-running vacuum could be sometimes cancelled by administrator. And
> > autovacuums could be cancelled by concurrent processes. Even if it
> > retries after cancellation, since it always restart from the first
> > block of table it could vacuums blocks again that we vacuumed last
> > time. We have visibility map to skip scanning all-visible blocks but
> > in case where the table is large and often modified, we're more likely
> > to reclaim more garbage from blocks other than we processed last time
> > than scanning from the first block.
> >
> > So I'd like to propose to make vacuums save its progress and resume
> > vacuuming based on it. The mechanism I'm thinking is simple; vacuums
> > periodically report the current block number to the stats collector.
> > If table has indexes, reports it after heap vacuum whereas reports it
> > every certain amount of blocks (e.g. 1024 blocks = 8MB) if no indexes.
> > We can see that value on new column vacuum_resume_block of
> > pg_stat_all_tables. I'm going to add one vacuum command option RESUME
> > and one new reloption vacuum_resume. If the option is true vacuums
> > fetch the block number from stats collector before starting and start
> > vacuuming from that block. I wonder if we could make it true by
> > default for autovacuums but it must be false when aggressive vacuum.
> >
> > If we start to vacuum from not first block, we can update neither
> > relfrozenxid nor relfrozenxmxid. And we might not be able to update
> > even relation statistics.
> >

Sounds like an interesting idea, but does it really help? If vacuum was
interrupted previously, wouldn't the next run get past the
already-processed pages quite quickly anyway, since the VM and FSM were
already updated for those pages in the previous run?
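
Going by the interface described above, usage would presumably be along
these lines (a sketch of the proposal only, none of this is committed
syntax):

-- resume a cancelled vacuum from the saved block (proposed RESUME option)
VACUUM (RESUME) my_table;

-- proposed reloption to make (auto)vacuum resume by default
ALTER TABLE my_table SET (vacuum_resume = true);

-- proposed column showing the saved position
SELECT vacuum_resume_block FROM pg_stat_all_tables
WHERE relname = 'my_table';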

A few minor things I noticed in the first look,
+/*
+ * When a table has no indexes, save the progress every 8GB so that we can
+ * resume vacuum from the middle of table. When table has indexes we save it
+ * after the second heap pass finished.
+ */
+#define VACUUM_RESUME_BLK_INTERVAL 1024 /* 8MB */
Discrepancy in the memory unit here: 1024 blocks * 8 kB is 8 MB, while
the comment says 8GB.

/* No found valid saved block number, resume from the first block */
This could be phrased better.

--
Regards,
Rafia Sabih




Re: [HACKERS] [WIP] Effective storage of duplicates in B-tree index.

2019-07-31 Thread Rafia Sabih
On Thu, 25 Jul 2019 at 05:49, Peter Geoghegan  wrote:
>
> On Wed, Jul 24, 2019 at 3:06 PM Peter Geoghegan  wrote:
> > There seems to be a kind of "synergy" between the nbtsplitloc.c
> > handling of pages that have lots of duplicates and posting list
> > compression. It seems as if the former mechanism "sets up the bowling
> > pins", while the latter mechanism "knocks them down", which is really
> > cool. We should try to gain a better understanding of how that works,
> > because it's possible that it could be even more effective in some
> > cases.
>
> I found another important way in which this synergy can fail to take
> place, which I can fix.
>
> By removing the BT_COMPRESS_THRESHOLD limit entirely, certain indexes
> from my test suite become much smaller, while most are not affected.
> These indexes were not helped too much by the patch before. For
> example, the TPC-E i_t_st_id index is 50% smaller. It is entirely full
> of duplicates of a single value (that's how it appears after an
> initial TPC-E bulk load), as are a couple of other TPC-E indexes.
> TPC-H's idx_partsupp_partkey index becomes ~18% smaller, while its
> idx_lineitem_orderkey index becomes ~15% smaller.
>
> I believe that this happened because rightmost page splits were an
> inefficient case for compression. But rightmost page split heavy
> indexes with lots of duplicates are not that uncommon. Think of any
> index with many NULL values, for example.
>
> I don't know for sure if BT_COMPRESS_THRESHOLD should be removed. I'm
> not sure what the idea is behind it. My sense is that we're likely to
> benefit by delaying page splits, no matter what. Though I am still
> looking at it purely from a space utilization point of view, at least
> for now.
>
Minor comment fix: pointes-->pointers. Also, are we really splitting into
halves, or just into two parts?
/*
+ * Split posting tuple into two halves.
+ *
+ * Left tuple contains all item pointes less than the new one and
+ * right tuple contains new item pointer and all to the right.
+ *
+ * TODO Probably we can come up with more clever algorithm.
+ */

Some remnants of 'he' remain here.
+/*
+ * If tuple is posting, t_tid.ip_blkid contains offset of the posting list.
+ * Caller is responsible for checking BTreeTupleIsPosting to ensure that
+ * it will get what he expects
+ */

Everything reads just fine without 'us'.
/*
+ * This field helps us to find beginning of the remaining tuples from
+ * postings which follow array of offset numbers.
+ */
-- 
Regards,
Rafia Sabih




Re: proposal - patch: psql - sort_by_size

2019-07-31 Thread Rafia Sabih
On Mon, 15 Jul 2019 at 06:12, Pavel Stehule  wrote:
>
> Hi
>
> On Fri, 12 Jul 2019 at 15:10, Fabien COELHO  wrote:
>>
>>
>> Hello Pavel,
>>
>> > rebased patch attached
>>
>> I prefer patches with a number rather than a date, if possible. For one
>> thing, there may be several updates in one day.
>>
>> About this version (20180708, probably v3): applies cleanly, compiles,
>> make check ok, doc build ok. No tests.
>
>
> attached version 4
>
>>
>> It works for me on a few manual tests against a 11.4 server.
>>
>> Documentation: if you say "\d*+", then it already applies to \db+ and
> >> \dP+, so why list them? Otherwise, state all commands or make it work
>> on all commands that have a size?
>>
>> About the text:
>>- remove , before "sorts"
>>- ... outputs by decreasing size, when size is displayed.
>>- add: When size is not displayed, the output is sorted by names.
>
>
> fixed
>
>>
>> I still think that the object name should be kept as a secondary sort
>> criterion, in case of size equality, so that the output is deterministic.
>> Having plenty of objects of the same size out of alphabetical order looks
>> very strange.
>
>
> fixed
>
> Regards
>
> Pavel
>>
>>
>> I still do not like much the boolean approach. I understand that the name
>> approach has been rejected, and I can understand why.
>>
>> I've been thinking about another more generic interface, that I'm putting
>> here for discussion, I do not claim that it is a good idea. Probably could
>> fall under "over engineering", but it might not be much harder to
>> implement, and it solves a few potential problems.
>>
>> The idea is to add an option to \d commands, such as "\echo -n":
>>
>>\dt+ [-o 1d,2a] ...
>>
>> meaning do the \dt+, order by column 1 descending, column 2 ascending.
>> With this there would be no need for a special variable nor other
>> extensions to specify some ordering, whatever the user wishes.
>>
>> Maybe it could be "\dt+ [-o '1 DESC, 2 ASC'] ..." so that the string
>> is roughly used as an ORDER BY specification by the query, but it would be
>> longer to specify.
>>
>> It also solves the issue that if someone wants another sorting order we
>> would end with competing boolean variables such as SORT_BY_SIZE,
>> SORT_BY_TYPE, SORT_BY_SCHEMA, which would be pretty unpractical. The
>> boolean approach works for *one* sorting extension and breaks at the next
>> extension.
>>
>> Also, the boolean does not say that it is a descending order. I could be
>> interested in looking at the small tables.
>>
>> Another benefit for me is that I do not like much variables with side
>> effects, whereas with an explicit syntax there would be no such thing, the
>> user has what was asked for. Ok, psql is full of them, but I cannot say I
>> like it for that.
>>
>> The approach could be extended to specify a limit, eg \dt -l 10 would
>> add a LIMIT 10 on the query.
>>
>> Also, the implementation could be high enough so that the description
>> handlers would not have to deal with it individually, it could return
>> the query which would then be completed with SORT/LIMIT clauses before
>> being executed, possibly with a default order if none is specified.

I had a look at this patch; it seems like a useful thing to have.
One clarification though: what is the reason for the different version
checks in listAllDbs and describeTablespaces? Precisely,

if (verbose && pset.sversion >= 90200)
+ {
  appendPQExpBuffer(&buf,
",\n  pg_catalog.pg_size_pretty(pg_catalog.pg_tablespace_size(oid))
AS \"%s\"",
gettext_noop("Size"));
+ sizefunc = "pg_catalog.pg_tablespace_size(oid)";
+ }
in describeTablespaces but
if (verbose && pset.sversion >= 80200)
+ {
  appendPQExpBuffer(&buf,
",\n   CASE WHEN pg_catalog.has_database_privilege(d.datname,
'CONNECT')\n"
"THEN
pg_catalog.pg_size_pretty(pg_catalog.pg_database_size(d.datname))\n"
"ELSE 'No Access'\n"
"   END as \"%s\"",
gettext_noop("Size"));
+ sizefunc = "pg_catalog.pg_database_size(d.datname)";
+ }
in listAllDbs.


-- 
Regards,
Rafia Sabih




Re: Initdb failure

2019-07-25 Thread Rafia Sabih
On Thu, 25 Jul 2019 at 16:44, Tom Lane  wrote:
>
> Rafia Sabih  writes:
> > On Thu, 25 Jul 2019 at 13:50, vignesh C  wrote:
> >>> Initdb fails when following path is provided as input:
> >>> datasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafdds/datasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafdds/datasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafdds/datasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafdds/datasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafdds/datasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafdds/
>
> > Now that you say this, it does make sense to at least inform about the
> > correct error and that too earlier. Something like the attached patch
> > would make sense.
>
> I am not terribly excited about putting effort into this at all, because
> I don't think that any actual user anywhere will ever get any benefit.
> The proposed test case is just silly.

That I totally agree with!
But on the other hand, emitting the right error message would at least
be good for the sake of correctness, if nothing else. But yes, that
should definitely be weighed against the effort required for this.

-- 
Regards,
Rafia Sabih




Re: Initdb failure

2019-07-25 Thread Rafia Sabih
On Thu, 25 Jul 2019 at 13:50, vignesh C  wrote:
>
> On Thu, Jul 25, 2019 at 4:52 PM Rafia Sabih  wrote:
> >
> > On Thu, 25 Jul 2019 at 07:39, vignesh C  wrote:
> > >
> > > Hi,
> > >
> > > Initdb fails when following path is provided as input:
> > > datasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafdds/datasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafdds/datasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafdds/datasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafdds/datasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafdds/datasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafdds/
> > >
> > > The cleanup also tends to fail in the cleanup path.
> > >
> > > Could be something to do with path handling.
> > This is because the value of MAXPGPATH is 1024 and the path you are
> > providing is longer than that. Hence, when it tries to read
> > PG_VERSION in ValidatePgVersion, it ends up with a wrong path truncated
> > to 1024 characters.
> >
>
> The error occurs at a very late point, after performing the initial
> work like creating the directory.  I'm thinking we should check this in
> the beginning and throw the error message at the beginning and exit
> cleanly.
>
Now that you say this, it does make sense to at least inform about the
correct error and that too earlier. Something like the attached patch
would make sense.

-- 
Regards,
Rafia Sabih
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 04d77ad700..3af11b365b 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -2726,6 +2726,12 @@ create_data_directory(void)
 {
 	int			ret;
 
+	if (strlen(pg_data) > MAXPGPATH)
+	{
+		pg_log_error("too long directory name \"%s\"", pg_data);
+		exit(1);
+	}
+
 	switch ((ret = pg_check_dir(pg_data)))
 	{
 		case 0:


Re: Initdb failure

2019-07-25 Thread Rafia Sabih
On Thu, 25 Jul 2019 at 07:39, vignesh C  wrote:
>
> Hi,
>
> Initdb fails when following path is provided as input:
> datasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafdds/datasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafdds/datasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafdds/datasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafdds/datasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafdds/datasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafddsdatasadfasfdsafdds/
>
> The cleanup also tends to fail in the cleanup path.
>
> Could be something to do with path handling.
This is because the value of MAXPGPATH is 1024 and the path you are
providing is longer than that. Hence, when it tries to read
PG_VERSION in ValidatePgVersion, it ends up with a wrong path truncated
to 1024 characters.

> I'm not sure if this is already known.
I am also not sure whether it is known or intentional. On the other hand,
I also don't know whether it is practical to give such a long name to a
database directory anyway.
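
For illustration, the truncation happens because paths are built into
fixed-size buffers; a sketch of the mechanism (not the actual initdb
code):

char        path[MAXPGPATH];

/* If pg_data is longer than the buffer, the result is silently cut off,
 * and the later read of PG_VERSION looks at the wrong, truncated path. */
snprintf(path, sizeof(path), "%s/PG_VERSION", pg_data);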

-- 
Regards,
Rafia Sabih




Fwd: [HACKERS] [WIP] Effective storage of duplicates in B-tree index.

2019-07-11 Thread Rafia Sabih
On Sun, 7 Jul 2019 at 01:08, Peter Geoghegan  wrote:

> * Maybe we could do compression with unique indexes when inserting
> values with NULLs? Note that we now treat an insertion of a tuple with
+1

I tried this patch and found the improvements impressive. However,
when I tried it with multi-column indexes it wasn't giving any
improvement; is that a known limitation of the patch?
I am surprised to find that such a patch has been on the radar for
quite some years now and is not yet committed.

Going through the patch, here are a few comments from me,

 /* Add the new item into the page */
+ offnum = OffsetNumberNext(offnum);
+
+ elog(DEBUG4, "insert_itupprev_to_page. compressState->ntuples %d
IndexTupleSize %zu free %zu",
+ compressState->ntuples, IndexTupleSize(to_insert), PageGetFreeSpace(page));
+
and other such DEBUG4 statements are meant to be removed, right...?
I didn't find any other such statements in this API, while there are
many in this patch, so I am not sure how much they are needed.

/*
* If we have only 10 uncompressed items on the full page, it probably
* won't worth to compress them.
*/
if (maxoff - n_posting_on_page < 10)
return;

Is this a magic number...?

/*
* We do not expect to meet any DEAD items, since this function is
* called right after _bt_vacuum_one_page(). If for some reason we
* found dead item, don't compress it, to allow upcoming microvacuum
* or vacuum clean it up.
*/
if (ItemIdIsDead(itemId))
continue;

This makes me wonder about those 'some' reasons.

Caller is responsible for checking BTreeTupleIsPosting to ensure that
+ * he will get what he expects

This could be re-framed to be gender-neutral.

Other than that, I am curious about the plans for its backward compatibility.

--
Regards,
Rafia Sabih




Re: Adaptive query optimization

2019-06-13 Thread Rafia Sabih
On Thu, 13 Jun 2019 at 06:07, Kuntal Ghosh  wrote:
>
> On Thu, Jun 13, 2019 at 5:49 AM Tomas Vondra
>  wrote:
> >
> > For example, we might require 1000 samples for a given node (say, scan
> > with some quals), before we start using it to tweak the estimates. Once
> > we get the number of estimates, we can continue collecting more data,
> > and once in a while update the correction. This would require some care,
> > of course, because we need to know what coefficient was used to compute
> > the estimate, but that's solvable by having some sort of epoch.
> >
> > Of course, the question is what number should we use, but overall this
> > would be a much lower-overhead way to do the learning.
> >
> > Unfortunately, the learning as implemented in the patch does not allow
> > this. It pretty much requires dedicated learning phase with generated
> > workload, in a single process.
> >
> > But I think that's solvable, assuming we:
> >
> > 1) Store the data in shared memory, instead of a file. Collect data from
> > all backends, instead of just a single one, etc.
> >
> > 2) Make the decision for individual entries, depending on how many
> > samples we have for it.
> >
> Sounds good. I was trying to think whether we can maintain a running
> coefficient. In that way, we don't have to store the samples. But,
> calculating a running coefficient for more than two variables (with
> some single pass algorithm) seems to be a hard problem. Moreover, it
> can introduce significant misestimation. Your suggested approach works
> better.
>
> > I suggest we try to solve one issue at a time. I agree advising which
> > indexes to create is a very interesting (and valuable) thing, but I see
> > it as an extension of the AQO feature. That is, basic AQO (tweaking row
> > estimates) can work without it.
> >
> +1
>
> > >> Now, if someone uses this same scan in a join, like for example
> > >>
> > >>SELECT * FROM t1 JOIN t2 ON (t1.id = t2.id)
> > >> WHERE (t1.a = ? AND t1.b = ? AND t1.c < ?)
> > >>   AND (t2.x = ? AND t2.y = ?)
> > >>
> > >> then we can still apply the same correction to the t1 scan (I think).
> > >> But then we can also collect data for the t1-t2 join, and compute a
> > >> correction coefficient in a similar way. It requires a bit of care
> > >> because we need to compensate for misestimates of inputs, but I think
> > >> that's doable.
> > >>
> > >That'll be an interesting work. For the above query, we can definitely
> > >calculate the correction coefficient of t1-t2 join given (t1.a = ? AND
> > >t1.b = ? AND t1.c < ?) and
> > >(t2.x = ? AND t2.y = ?) are true. But, I'm not sure how we can
> > >extrapolate that value for t1-t2 join.
> >
> > I'm not sure I see the problem? Essentially, we need to know the sizes
> > of the join inputs, i.e.
> >
> > t1 WHERE (t1.a = ? AND t1.b = ? AND t1.c < ?)
> >
> > t2 WHERE (t2.x = ? AND t2.y = ?)
> >
> > (which we know, and we know how to correct the estimate), and then the
> > selectivity of the join condition. Which we also know.
> >
> > Obviously, there's a chance those parts (clauses at the scan / join
> > level) are correlated, which could make this less accurate.
> This is exactly what my concern is. The base predicate selectivities
> of t1 and t2 should have an impact on the calculation of the
> correction coefficient. If those selectivities are low, the
> misestimation (which is actual/estimate) should not affect the t1-t2
> join correction coefficient much.
>
Interesting discussion. Talking of query optimization techniques and
challenges, isn't the biggest challenge there that of selectivity
estimation? Then, instead of optimizing a process that has been
discussed for so long, how about skipping the process altogether?
This reminds me of some work I came across a while back [1].
Basically, the idea is to spend no energy on estimating the
selectivities and instead get on with the execution. Precisely, a set of
plans is kept a priori for different selectivities, and at execution
time the plans are run one by one, starting from the lowest-selectivity
one, until the query execution completes. It might sound like too much
work, but it isn't; there are theoretical guarantees that bound the
worst-case execution time. The trick is in choosing the plan set and
switching at execution time. Another good point about this approach is
that it works smoothly for join predicates as well.

Since we are talking about this problem here, I thought it might be a
good idea to shed some light on such an approach and see if there is
some interesting trick we might use.
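
Roughly, the execution side of [1] works like the sketch below; the
names are invented and the details simplified, so see the paper for the
actual algorithm:

/* plans[] is chosen a priori, ordered by the selectivity each one is
 * optimal for; the budget starts at the cheapest plan's estimated cost. */
double budget = estimated_cost(plans[0]);

for (int i = 0; i < nplans; i++)
{
    /* Run the plan, but abandon it once it has consumed its budget. */
    if (execute_within_budget(plans[i], budget))
        break;          /* the query completed under this plan */
    budget *= 2;        /* geometric growth bounds the total wasted work */
}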

[1] https://dsl.cds.iisc.ac.in/publications/conference/bouquet.pdf

-- 
Regards,
Rafia Sabih




Re: [PATCH] Incremental sort (was: PoC: Partial sort)

2019-06-05 Thread Rafia Sabih
> > of the group, and if we run into an unexpectedly large one we
> > might abandon the incremental sort and switch to a "full sort" mode.
>
> Are there good examples of our doing this in other types of nodes
> (whether the fallback is an entirely different algorithm/node type)? I
> like this idea in theory, but I also think it's likely it would add a
> very significant amount of complexity. The other problem is knowing
> where to draw the line: you end up creating these kinds of cliffs
> where pulling one more tuple through the incremental sort would give
> you your batch and result in not having to pull many more tuples in a
> regular sort node, but the fallback logic kicks in anyway.
>
What about having some simple mechanism for this: if we encounter a
group with more tuples than estimated, then simply switch to a normal
sort for the remaining tuples, as the estimates do not hold true anyway.
At least this will avoid the regressions where incremental sort performs
much worse than the normal sort.
I mean something like this: populate the tuplesortstate and keep
counting the number of tuples in a group; if they are within the budget,
call tuplesort_performsort, otherwise put all the remaining tuples into
the tuplesort and then call tuplesort_performsort. We may have an
additional field in IncrementalSortState to save the estimated size of
each group. I am assuming that we use MCV lists to better approximate
the group sizes, as suggested above by Tomas. A rough sketch follows
below.
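
Something along these lines, where ntuples_in_group, estGroupSize and
the fallback flag are hypothetical additions, not fields from the patch:

/* Accumulate the current group; all names here are illustrative only. */
tuplesort_puttupleslot(node->tuplesortstate, slot);
node->ntuples_in_group++;

if (group_boundary_reached)
{
    /* The group fit the estimate: sort and emit it as usual. */
    tuplesort_performsort(node->tuplesortstate);
}
else if (node->ntuples_in_group > node->estGroupSize)
{
    /*
     * The estimate did not hold: stop per-group sorting, keep feeding
     * all remaining tuples into the tuplesort, and sort them once at
     * the end.
     */
    node->fall_back_to_full_sort = true;
}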

> Unrelated to all of the above: if I read the patch properly it
> intentionally excludes backwards scanning. I don't see any particular
> reason why that ought to be the case, and it seems like an odd
> limitation for the feature should it be merged. Should that be a
> blocker to merging?

Regarding this, I came across this,
/*
  * Incremental sort can't be used with either EXEC_FLAG_REWIND,
  * EXEC_FLAG_BACKWARD or EXEC_FLAG_MARK, because we hold only current
  * bucket in tuplesortstate.
  */
I think that is quite understandable. How are you planning to support
backward scans for this? In other words, when would incremental sort be
useful for a backward scan?

On a different note, I can't stop imagining this operator along lines
similar to Parallel Append, wherein multiple workers could sort the
different groups independently at the same time.

-- 
Regards,
Rafia Sabih




Re: Index Skip Scan

2019-06-01 Thread Rafia Sabih
On Sat, 1 Jun 2019 at 06:10, Floris Van Nee  wrote:
>
> Actually I'd like to add something to this. I think I've found a bug in the 
> current implementation. Would someone be able to check?
>
I am willing to give it a try.
> Given a table definition of (market text, feedcode text, updated_at 
> timestamptz, value float8) and an index on (market, feedcode, updated_at 
> desc) (note that this table slightly deviates from what I described in my 
> previous mail) and filling it with data.
>
>
> The following query uses an index skip scan and returns just 1 row 
> (incorrect!)
>
> select distinct on (market, feedcode) market, feedcode
> from streams.base_price
> where market='TEST'
>
> The following query still uses the regular index scan and returns many more 
> rows (correct)
> select distinct on (market, feedcode) *
> from streams.base_price
> where market='TEST'
>
Aren't those two queries different?
select distinct on (market, feedcode) market, feedcode vs select
distinct on (market, feedcode) *
Anyhow, it's just a difference in projection, so it doesn't matter much.
I verified this scenario at my end and you are right, there is a bug.
Here is my repeatable test case:

create table t (market text, feedcode text, updated_at timestamptz,
value float8) ;
create index on t (market, feedcode, updated_at desc);
insert into t values('TEST', 'abcdef', (select timestamp '2019-01-10
20:00:00' + random() * (timestamp '2014-01-20 20:00:00' - timestamp
'2019-01-20 20:00:00') ), generate_series(1,100)*9.88);
insert into t values('TEST', 'jsgfhdfjd', (select timestamp
'2019-01-10 20:00:00' + random() * (timestamp '2014-01-20 20:00:00' -
timestamp '2019-01-20 20:00:00') ), generate_series(1,100)*9.88);

Now, without the patch,
select distinct on (market, feedcode) market, feedcode from t  where
market='TEST';
 market | feedcode
--------+-----------
 TEST   | abcdef
 TEST   | jsgfhdfjd
(2 rows)
explain select distinct on (market, feedcode) market, feedcode from t
where market='TEST';
   QUERY PLAN
------------------------------------------------------------
 Unique  (cost=12.20..13.21 rows=2 width=13)
   ->  Sort  (cost=12.20..12.70 rows=201 width=13)
 Sort Key: feedcode
 ->  Seq Scan on t  (cost=0.00..4.51 rows=201 width=13)
   Filter: (market = 'TEST'::text)
(5 rows)

And with the patch,
select distinct on (market, feedcode) market, feedcode from t   where
market='TEST';
 market | feedcode
--------+-----------
 TEST   | abcdef
(1 row)

explain select distinct on (market, feedcode) market, feedcode from t
where market='TEST';
   QUERY PLAN
------------------------------------------------------------
 Index Only Scan using t_market_feedcode_updated_at_idx on t
(cost=0.14..0.29 rows=2 width=13)
   Scan mode: Skip scan
   Index Cond: (market = 'TEST'::text)
(3 rows)

Notice that the explain output shows the correct number of rows (2),
even though the query actually returns only one.

-- 
Regards,
Rafia Sabih




Re: New EXPLAIN option: ALL

2019-05-07 Thread Rafia Sabih
On Tue, 7 May 2019 at 09:30, David Fetter  wrote:
>
> Folks,
>
> It can get a little tedious turning on (or off) all the boolean
> options to EXPLAIN, so please find attached a shortcut.
>

I don't understand this; would you mind explaining a bit, maybe with an
example of how you want it to work?
-- 
Regards,
Rafia Sabih




Re: Pluggable Storage - Andres's take

2019-05-07 Thread Rafia Sabih
On Mon, 6 May 2019 at 22:39, Ashwin Agrawal  wrote:
>
> On Mon, May 6, 2019 at 7:14 AM Andres Freund  wrote:
> >
> > Hi,
> >
> > On May 6, 2019 3:40:55 AM PDT, Rafia Sabih  
> > wrote:
> > >I was trying the toyam patch, and on make check it failed with a
> > >segmentation fault at
> > >
> > >static void
> > >toyam_relation_set_new_filenode(Relation rel,
> > > char persistence,
> > > TransactionId *freezeXid,
> > > MultiXactId *minmulti)
> > >{
> > > *freezeXid = InvalidTransactionId;
> > >
> > >Basically, running create table t (i int, j int) using toytable
> > >leads to this segmentation fault.
> > >
> > >Am I missing something here?
> >
> > I assume you got compiler warmings compiling it? The API for some callbacks 
> > changed a bit.
>
> Attached patch gets toy table AM implementation to match latest master API.
> The patch builds on top of patch from Heikki in [1].
> Compiles and works but the test still continues to fail with WARNING
> for issue mentioned in [1]
>
Thanks Ashwin, this works fine, with the mentioned warnings of course.

-- 
Regards,
Rafia Sabih




Re: Pluggable Storage - Andres's take

2019-05-07 Thread Rafia Sabih
On Mon, 6 May 2019 at 16:14, Andres Freund  wrote:
>
> Hi,
>
> On May 6, 2019 3:40:55 AM PDT, Rafia Sabih  wrote:
> >On Tue, 9 Apr 2019 at 15:17, Heikki Linnakangas 
> >wrote:
> >>
> >> On 08/04/2019 20:37, Andres Freund wrote:
> >> > On 2019-04-08 15:34:46 +0300, Heikki Linnakangas wrote:
> >> >> There's a little bug in index-only scan executor node, where it
> >mixes up the
> >> >> slots to hold a tuple from the index, and from the table. That
> >doesn't cause
> >> >> any ill effects if the AM uses TTSOpsHeapTuple, but with my toy
> >AM, which
> >> >> uses a virtual slot, it caused warnings like this from index-only
> >scans:
> >> >
> >> > Hm. That's another one that I think I had fixed previously :(, and
> >then
> >> > concluded that it's not actually necessary for some reason. Your
> >fix
> >> > looks correct to me.  Do you want to commit it? Otherwise I'll look
> >at
> >> > it after rebasing zheap, and checking it with that.
> >>
> >> I found another slot type confusion bug, while playing with zedstore.
> >In
> >> an Index Scan, if you have an ORDER BY key that needs to be
> >rechecked,
> >> so that it uses the reorder queue, then it will sometimes use the
> >> reorder queue slot, and sometimes the table AM's slot, for the scan
> >> slot. If they're not of the same type, you get an assertion:
> >>
> >> TRAP: FailedAssertion("!(op->d.fetch.kind == slot->tts_ops)", File:
> >> "execExprInterp.c", Line: 1905)
> >>
> >> Attached is a test for this, again using the toy table AM, extended
> >to
> >> be able to test this. And a fix.
> >>
> >> >> Attached is a patch with the toy implementation I used to test
> >this. I'm not
> >> >> suggesting we should commit that - although feel free to do that
> >if you
> >> >> think it's useful - but it shows how I bumped into these issues.
> >> >
> >> > Hm, probably not a bad idea to include something like it. It seems
> >like
> >> > we kinda would need non-stub implementation of more functions for
> >it to
> >> > test much / and to serve as an example.  I'm mildy inclined to just
> >do
> >> > it via zheap / externally, but I'm not quite sure that's good
> >enough.
> >>
> >> Works for me.
> >>
> >> >> +static Size
> >> >> +toyam_parallelscan_estimate(Relation rel)
> >> >> +{
> >> >> +ereport(ERROR,
> >> >> +(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
> >> >> + errmsg("function %s not implemented yet",
> >__func__)));
> >> >> +}
> >> >
> >> > The other stubbed functions seem like we should require them, but I
> >> > wonder if we should make the parallel stuff optional?
> >>
> >> Yeah, that would be good. I would assume it to be optional.
> >>
> >I was trying the toyam patch, and on make check it failed with a
> >segmentation fault at
> >
> >static void
> >toyam_relation_set_new_filenode(Relation rel,
> > char persistence,
> > TransactionId *freezeXid,
> > MultiXactId *minmulti)
> >{
> > *freezeXid = InvalidTransactionId;
> >
> >Basically, running create table t (i int, j int) using toytable
> >leads to this segmentation fault.
> >
> >Am I missing something here?
>
> I assume you got compiler warmings compiling it? The API for some callbacks 
> changed a bit.
>
Oh yeah it does.


-- 
Regards,
Rafia Sabih




Re: Pluggable Storage - Andres's take

2019-05-06 Thread Rafia Sabih
On Tue, 9 Apr 2019 at 15:17, Heikki Linnakangas  wrote:
>
> On 08/04/2019 20:37, Andres Freund wrote:
> > On 2019-04-08 15:34:46 +0300, Heikki Linnakangas wrote:
> >> There's a little bug in index-only scan executor node, where it mixes up 
> >> the
> >> slots to hold a tuple from the index, and from the table. That doesn't 
> >> cause
> >> any ill effects if the AM uses TTSOpsHeapTuple, but with my toy AM, which
> >> uses a virtual slot, it caused warnings like this from index-only scans:
> >
> > Hm. That's another one that I think I had fixed previously :(, and then
> > concluded that it's not actually necessary for some reason. Your fix
> > looks correct to me.  Do you want to commit it? Otherwise I'll look at
> > it after rebasing zheap, and checking it with that.
>
> I found another slot type confusion bug, while playing with zedstore. In
> an Index Scan, if you have an ORDER BY key that needs to be rechecked,
> so that it uses the reorder queue, then it will sometimes use the
> reorder queue slot, and sometimes the table AM's slot, for the scan
> slot. If they're not of the same type, you get an assertion:
>
> TRAP: FailedAssertion("!(op->d.fetch.kind == slot->tts_ops)", File:
> "execExprInterp.c", Line: 1905)
>
> Attached is a test for this, again using the toy table AM, extended to
> be able to test this. And a fix.
>
> >> Attached is a patch with the toy implementation I used to test this. I'm 
> >> not
> >> suggesting we should commit that - although feel free to do that if you
> >> think it's useful - but it shows how I bumped into these issues.
> >
> > Hm, probably not a bad idea to include something like it. It seems like
> > we kinda would need non-stub implementation of more functions for it to
> > test much / and to serve as an example.  I'm mildy inclined to just do
> > it via zheap / externally, but I'm not quite sure that's good enough.
>
> Works for me.
>
> >> +static Size
> >> +toyam_parallelscan_estimate(Relation rel)
> >> +{
> >> +ereport(ERROR,
> >> +(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
> >> + errmsg("function %s not implemented yet", 
> >> __func__)));
> >> +}
> >
> > The other stubbed functions seem like we should require them, but I
> > wonder if we should make the parallel stuff optional?
>
> Yeah, that would be good. I would assume it to be optional.
>
I was trying the toyam patch, and on make check it failed with a
segmentation fault at

static void
toyam_relation_set_new_filenode(Relation rel,
 char persistence,
 TransactionId *freezeXid,
 MultiXactId *minmulti)
{
 *freezeXid = InvalidTransactionId;

Basically, running create table t (i int, j int) using toytable
leads to this segmentation fault.

Am I missing something here?


-- 
Regards,
Rafia Sabih




Re: make \d pg_toast.foo show its indices

2019-05-06 Thread Rafia Sabih
On Fri, 3 May 2019 at 16:27, Justin Pryzby  wrote:
>
> On Fri, May 03, 2019 at 02:55:47PM +0200, Rafia Sabih wrote:
> > On Mon, 22 Apr 2019 at 17:49, Justin Pryzby  wrote:
> > >
> > > It's deliberate that \dt doesn't show toast tables.
> > > \d shows them, but doesn't show their indices.
> > >
> > > It seems to me that their indices should be shown, without having to 
> > > think and
> > > know to query pg_index.
> > >
> > > postgres=# \d pg_toast.pg_toast_2600
> > > TOAST table "pg_toast.pg_toast_2600"
> > >Column   |  Type
> > > +-
> > >  chunk_id   | oid
> > >  chunk_seq  | integer
> > >  chunk_data | bytea
> > > Indexes:
> > > "pg_toast_2600_index" PRIMARY KEY, btree (chunk_id, chunk_seq)
> >
> > +1.
>
> Thanks - what about also showing the associated non-toast table ?
>
IMHO, what makes more sense is to show the name of the associated toast
table in the \dt+ output of the normal table.
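
For reference, that association can already be looked up manually, which
is exactly the extra step \dt+ could save ('my_table' is just a
placeholder here):

select relname, reltoastrelid::regclass as toast_table
from pg_class
where relname = 'my_table';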


-- 
Regards,
Rafia Sabih




Re: make \d pg_toast.foo show its indices

2019-05-03 Thread Rafia Sabih
On Mon, 22 Apr 2019 at 17:49, Justin Pryzby  wrote:
>
> It's deliberate that \dt doesn't show toast tables.
> \d shows them, but doesn't show their indices.
>
> It seems to me that their indices should be shown, without having to think and
> know to query pg_index.
>
> postgres=# \d pg_toast.pg_toast_2600
> TOAST table "pg_toast.pg_toast_2600"
>Column   |  Type
> +-
>  chunk_id   | oid
>  chunk_seq  | integer
>  chunk_data | bytea
> Indexes:
> "pg_toast_2600_index" PRIMARY KEY, btree (chunk_id, chunk_seq)
>
+1.


-- 
Regards,
Rafia Sabih




Re: [PATCH v1] Show whether tables are logged in \dt+

2019-05-03 Thread Rafia Sabih
On Sat, 27 Apr 2019 at 06:18, David Fetter  wrote:
>
> On Fri, Apr 26, 2019 at 04:22:18PM +0200, Rafia Sabih wrote:
> > On Fri, 26 Apr 2019 at 14:49, Rafia Sabih  wrote:
> > >
> > > On Wed, 24 Apr 2019 at 10:30, Fabien COELHO  wrote:
> > > >
> > > >
> > > > Hello David,
> > > >
> > > > >>> I noticed that there wasn't a bulk way to see table logged-ness in 
> > > > >>> psql,
> > > > >>> so I made it part of \dt+.
> > > > >>
> > > > >> Applies, compiles, works for me.
> > > > >>
> > > > >> ISTM That temporary-ness is not shown either. Maybe the persistence 
> > > > >> column
> > > > >> should be shown as is?
> > > > >
> > > > > Temporariness added, but not raw.
> > > >
> > > > Ok, it is better like this way.
> > > >
> > > > >> Tests?
> > > > >
> > > > > Included, but they're not stable for temp tables. I'm a little stumped
> > > > > as to how to either stabilize them or test some other way.
> > > >
> > > > Hmmm. First there is the username which appears, so there should be a
> > > > dedicated user for the test.
> > > >
> > > > I'm unsure how to work around the temporary schema number, which is
> > > > nondeterministic with parallel execution. I'm afraid the only viable
> > > > approach is not to show temporary tables, too bad:-(
> > > >
> > > > >> Doc?
> > > > >
> > > > > What further documentation does it need?
> > > >
> > > > Indeed, there is no precise doc, so nothing to update :-)/:-(
> > > >
> > > >
> > > > Maybe you could consider adding a case for prior 9.1 version, something
> > > > like:
> > > >... case c.relistemp then 'temporary' else 'permanent' end as ...
> > > >
> > > >
> > > I was reviewing this patch and found a bug,
> > >
> > > create table t (i int);
> > > create index idx on t(i);
> > > \di+
> > > psql: print.c:3452: printQuery: Assertion `opt->translate_columns ==
> > > ((void *)0) || opt->n_translate_columns >= cont.ncolumns' failed.
> >
> > Looking into this further, apparently the position of
> >
> >   if (verbose)
> >   {
> > + /*
> > + * Show whether the table is permanent, temporary, or unlogged.
> > + */
> > + if (pset.sversion >= 91000)
> > + appendPQExpBuffer(&buf,
> > +   ",\n  case c.relpersistence when 'p' then 'permanent' when 't'
> > then 'temporary' when 'u' then 'unlogged' else 'unknown' end as
> > \"%s\"",
> > +   gettext_noop("Persistence"));
> >
> > is not right, it is being called for indexes with verbose option also.
> > There should be an extra check for it being not called for index case.
> > Something like,
> > if (verbose)
> > {
> > /*
> > * Show whether the table is permanent, temporary, or unlogged.
> > */
> > if (!showIndexes)
> > if (pset.sversion >= 91000)
> > appendPQExpBuffer(&buf,
> >   ",\n  case c.relpersistence when 'p' then 'permanent' when 't' then
> > 'temporary' when 'u' then 'unlogged' else 'unknown' end as \"%s\"",
> >   gettext_noop("Persistence"));
> >
> > Not sure how to modify it in a neater way.
>
> I suspect that as this may get a little messier, but I've made it
> fairly neat short of a major refactor.
>
I found the following warning during compilation:
describe.c: In function ‘listTables’:
describe.c:3705:7: warning: this ‘if’ clause does not guard...
[-Wmisleading-indentation]
  else if (pset.sversion >= 80100)
   ^~
describe.c:3710:3: note: ...this statement, but the latter is
misleadingly indented as if it were guarded by the ‘if’
   appendPQExpBuffer(,

Talking of indentation, you might want to run pgindent once. Other
than that the patch looks good to me.
-- 
Regards,
Rafia Sabih




Re: Execute INSERT during a parallel operation

2019-04-26 Thread Rafia Sabih
On Sun, 14 Apr 2019 at 04:11, Donald Dong  wrote:
>
> Hi,
>
> I'm trying to use the SPI to save the executed plans in the ExecutorEnd. When 
> the plan involves multiple workers, the insert operations would trigger an 
> error: cannot execute INSERT during a parallel operation.
>
> I wonder if there's a different hook I can use when there's a gather node? or 
> other ways to get around?

A bit more detail on what exactly you are trying to do, e.g. the query or
code you are dealing with, would be helpful for providing suggestions.
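
In case it helps while you gather those details: one pattern I could
imagine is to skip (or defer) the write while parallel mode is still
active, e.g. a sketch with a hypothetical helper doing the SPI INSERT:

static void
my_ExecutorEnd(QueryDesc *queryDesc)
{
    /* SPI writes are rejected while parallel mode is active,
     * so only do the INSERT when it is safe. */
    if (!IsInParallelMode())
        save_plan_with_spi(queryDesc);  /* hypothetical helper */

    standard_ExecutorEnd(queryDesc);
}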


-- 
Regards,
Rafia Sabih




Re: [PATCH v1] Show whether tables are logged in \dt+

2019-04-26 Thread Rafia Sabih
On Fri, 26 Apr 2019 at 14:49, Rafia Sabih  wrote:
>
> On Wed, 24 Apr 2019 at 10:30, Fabien COELHO  wrote:
> >
> >
> > Hello David,
> >
> > >>> I noticed that there wasn't a bulk way to see table logged-ness in psql,
> > >>> so I made it part of \dt+.
> > >>
> > >> Applies, compiles, works for me.
> > >>
> > >> ISTM That temporary-ness is not shown either. Maybe the persistence 
> > >> column
> > >> should be shown as is?
> > >
> > > Temporariness added, but not raw.
> >
> > Ok, it is better like this way.
> >
> > >> Tests?
> > >
> > > Included, but they're not stable for temp tables. I'm a little stumped
> > > as to how to either stabilize them or test some other way.
> >
> > Hmmm. First there is the username which appears, so there should be a
> > dedicated user for the test.
> >
> > I'm unsure how to work around the temporary schema number, which is
> > nondeterministic with parallel execution. I'm afraid the only viable
> > approach is not to show temporary tables, too bad:-(
> >
> > >> Doc?
> > >
> > > What further documentation does it need?
> >
> > Indeed, there is no precise doc, so nothing to update :-)/:-(
> >
> >
> > Maybe you could consider adding a case for prior 9.1 version, something
> > like:
> >... case c.relistemp then 'temporary' else 'permanent' end as ...
> >
> >
> I was reviewing this patch and found a bug,
>
> create table t (i int);
> create index idx on t(i);
> \di+
> psql: print.c:3452: printQuery: Assertion `opt->translate_columns ==
> ((void *)0) || opt->n_translate_columns >= cont.ncolumns' failed.

Looking into this further, apparently the position of

  if (verbose)
  {
+ /*
+ * Show whether the table is permanent, temporary, or unlogged.
+ */
+ if (pset.sversion >= 91000)
+ appendPQExpBuffer(&buf,
+   ",\n  case c.relpersistence when 'p' then 'permanent' when 't'
then 'temporary' when 'u' then 'unlogged' else 'unknown' end as
\"%s\"",
+   gettext_noop("Persistence"));

is not right, it is being called for indexes with verbose option also.
There should be an extra check for it being not called for index case.
Something like,
if (verbose)
{
/*
* Show whether the table is permanent, temporary, or unlogged.
*/
if (!showIndexes)
if (pset.sversion >= 91000)
appendPQExpBuffer(&buf,
  ",\n  case c.relpersistence when 'p' then 'permanent' when 't' then
'temporary' when 'u' then 'unlogged' else 'unknown' end as \"%s\"",
  gettext_noop("Persistence"));

Not sure how to modify it in a neater way.

-- 
Regards,
Rafia Sabih




Re: [PATCH v1] Show whether tables are logged in \dt+

2019-04-26 Thread Rafia Sabih
On Wed, 24 Apr 2019 at 10:30, Fabien COELHO  wrote:
>
>
> Hello David,
>
> >>> I noticed that there wasn't a bulk way to see table logged-ness in psql,
> >>> so I made it part of \dt+.
> >>
> >> Applies, compiles, works for me.
> >>
> >> ISTM That temporary-ness is not shown either. Maybe the persistence column
> >> should be shown as is?
> >
> > Temporariness added, but not raw.
>
> Ok, it is better like this way.
>
> >> Tests?
> >
> > Included, but they're not stable for temp tables. I'm a little stumped
> > as to how to either stabilize them or test some other way.
>
> Hmmm. First there is the username which appears, so there should be a
> dedicated user for the test.
>
> I'm unsure how to work around the temporary schema number, which is
> nondeterministic with parallel execution. I'm afraid the only viable
> approach is not to show temporary tables, too bad:-(
>
> >> Doc?
> >
> > What further documentation does it need?
>
> Indeed, there is no precise doc, so nothing to update :-)/:-(
>
>
> Maybe you could consider adding a case for prior 9.1 version, something
> like:
>... case c.relistemp then 'temporary' else 'permanent' end as ...
>
>
I was reviewing this patch and found a bug,

create table t (i int);
create index idx on t(i);
\di+
psql: print.c:3452: printQuery: Assertion `opt->translate_columns ==
((void *)0) || opt->n_translate_columns >= cont.ncolumns' failed.

-- 
Regards,
Rafia Sabih




Re: Zedstore - compressed in-core columnar storage

2019-04-11 Thread Rafia Sabih
scan just the internal pages. Unless the
> column is very wide, that's only a small fraction of the data. That
> makes the space immediately reusable for new insertions, but it won't
> return the space to the Operating System. In order to do that, we'd
> still need to defragment, moving pages from the end of the file closer
> to the beginning, and truncate the file.
>
> In this design, we only cache compressed pages in the page cache. If
> we want to cache uncompressed pages instead, or in addition to that,
> we need to invent a whole new kind of a buffer cache that can deal
> with the variable-size blocks.
>
> If you do a lot of updates, the file can get fragmented, with lots of
> unused space on pages. Losing the correlation between TIDs and
> physical order is also bad, because it will make SeqScans slow, as
> they're not actually doing sequential I/O anymore. We can write a
> defragmenter to fix things up. Half-empty pages can be merged, and
> pages can be moved to restore TID/physical correlation. This format
> doesn't have the same MVCC problems with moving tuples around that the
> Postgres heap does, so it can be fairly easily be done on-line.
>
> Min-max values can be stored per block to easily skip scanning it if the
> column values don't fall in the range.
>
> Notes about current patch
> =
>
> Basic (core) functionality is implemented to showcase and play with.
>
> Two compression algorithms are supported: Postgres pg_lzcompress and
> lz4. Compiling the server with --with-lz4 enables LZ4 compression for
> zedstore; otherwise pg_lzcompress is the default. LZ4 is definitely
> super fast at compressing and uncompressing.
>
> Not all the table AM APIs are implemented. Functionality not
> implemented yet will ERROR out as not supported. A zedstore table can
> be created using the command:
>
> CREATE TABLE  (column listing) USING zedstore;
>
> Bulk load can be performed using COPY. INSERT, SELECT, UPDATE and
> DELETE work. Btree indexes can be created. Btree and bitmap index
> scans work. The test in src/test/regress/sql/zedstore.sql showcases
> all the functionality working currently. Updates are currently
> implemented as cold, meaning they always create new items and are not
> performed in-place.
>
> TIDs currently can't leverage the full 48-bit range but instead are
> limited to values which are considered valid ItemPointers. Also,
> MaxHeapTuplesPerPage poses restrictions on the values they can
> currently have. Refer to [7] for the same.
>
> Extremely basic UNDO logging has been implemented, just from the MVCC
> perspective. MVCC is missing tuple locks right now. Plus, it doesn't
> actually perform any undo yet. No WAL logging exists currently, hence
> it's not crash safe either.
>
> Helpful functions to find how many pages of each type are present in a
> zedstore table and to find the compression ratio are provided.
>
> The test mentioned in the thread "Column lookup in a row performance"
> [6], a good example query for zedstore, run locally on a laptop using
> lz4, shows:
> postgres=# SELECT AVG(i199) FROM (select i199 from layout offset 0) x; --
> heap
>  avg
> -
>  50.5000
> (1 row)
>
> Time: 4679.026 ms (00:04.679)
>
> postgres=# SELECT AVG(i199) FROM (select i199 from zlayout offset 0) x; --
> zedstore
>  avg
> -
>  50.5000
> (1 row)
>
> Time: 379.710 ms
>
> Important note:
> ---
> The planner has not been modified yet to leverage the columnar
> storage. Hence, plans using the "physical tlist" optimization and
> similar tricks that are good for a row store currently miss out on
> leveraging the columnar nature. That is why the subquery with OFFSET 0
> is needed above, to disable the optimization and scan only the
> required column.
>
>
>
> The current proposal and discussion are focused on the AM layer work
> first. Hence, it currently and intentionally skips discussing planner
> and executor "feature" enhancements like adding vectorized execution
> and that family of features.
>
> Previous discussions and implementations for column stores (Vertical
> cluster index [2], In-core columnar storage [3] and [4], cstore_fdw
> [5]) were referred to in order to distill objectives and come up with
> a design and implementation that avoid any concerns raised earlier.
> Learnings from the Greenplum Database column store were also leveraged
> while designing and implementing the same.
>
> Credit: The design is mostly the brainchild of Heikki, or actually his
> epiphany, to be exact. I acted as an idea bouncing board and
> contributed enhancements to the same. We are both having a lot of fun
> writing the code for this.
>
>
> References
> 1] https://github.com/greenplum-db/postgres/tree/zedstore
> 2]
> https://www.postgresql.org/message-id/flat/CAJrrPGfaC7WC9NK6PTTy6YN-NN%2BhCy8xOLAh2doYhVg5d6HsAA%40mail.gmail.com
> 3]
> https://www.postgresql.org/message-id/flat/20150611230316.GM133018%40postgresql.org
> 4]
> https://www.postgresql.org/message-id/flat/20150831225328.GM2912%40alvherre.pgsql
> 5] https://github.com/citusdata/cstore_fdw
> 6]
> https://www.postgresql.org/message-id/flat/CAOykqKfko-n5YiBJtk-ocVdp%2Bj92Apu5MJBwbGGh4awRY5NCuQ%40mail.gmail.com
> 7]
> https://www.postgresql.org/message-id/d0fc97bd-7ec8-2388-e4a6-0fda86d71a43%40iki.fi
>
>
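
For anyone who wants to try this out, a minimal session pieced together
from the description above (table, column, and file names are made up):

CREATE TABLE zitems (id int, payload text) USING zedstore;
COPY zitems FROM '/tmp/zitems.csv' CSV;          -- bulk load
CREATE INDEX zitems_id_idx ON zitems (id);       -- btree index
UPDATE zitems SET payload = 'new' WHERE id = 1;  -- "cold" update
-- the OFFSET 0 subquery keeps the planner from fetching all columns
SELECT avg(id) FROM (SELECT id FROM zitems OFFSET 0) x;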

-- 
Regards,
Rafia Sabih


Re: Zedstore - compressed in-core columnar storage

2019-04-11 Thread Rafia Sabih
On Tue, 9 Apr 2019 at 20:29, Robert Haas  wrote:

> On Tue, Apr 9, 2019 at 11:51 AM Alvaro Herrera 
> wrote:
> > This is not surprising, considering that columnar store is precisely the
> > reason for starting the work on table AMs.
> >
> > We should certainly look into integrating some sort of columnar storage
> > in mainline.  Not sure which of zedstore or VOPS is the best candidate,
> > or maybe we'll have some other proposal.  My feeling is that having more
> > than one is not useful; if there are optimizations to one that can be
> > borrowed from the other, let's do that instead of duplicating effort.
>
> I think that conclusion may be premature.  There seem to be a bunch of
> different ways of doing columnar storage, so I don't know how we can
> be sure that one size will fit all, or that the first thing we accept
> will be the best thing.
>
> Of course, we probably do not want to accept a ton of storage manager
> implementations is core.  I think if people propose implementations
> that are poor quality, or missing important features, or don't have
> significantly different use cases from the ones we've already got,
> it's reasonable to reject those.  But I wouldn't be prepared to say
> that if we have two significantly different column store that are both
> awesome code with a complete feature set and significantly disjoint
> use cases, we should reject the second one just because it is also a
> column store.  I think that won't get out of control because few
> people will be able to produce really high-quality implementations.
>
> This stuff is hard, which I think is also why we only have 6.5 index
> AMs in core after many, many years.  And our standards have gone up
> over the years - not all of those would pass muster if they were
> proposed today.
>
> BTW, can I express a small measure of disappointment that the name for
> the thing under discussion on this thread chose to be called
> "zedstore"?  That seems to invite confusion with "zheap", especially
> in parts of the world where the last letter of the alphabet is
> pronounced "zed," where people are going to say zed-heap and
> zed-store. Brr.
>

+1 on Brr. Looks like your and Thomas's thought that having a 'z' makes
things popular/stylish etc. is true after all; I was skeptical back then.

-- 
Regards,
Rafia Sabih


Re: Zedstore - compressed in-core columnar storage

2019-04-11 Thread Rafia Sabih
nar storage [3] and [4], cstore_fdw [5]
> were referred to in order to distill objectives and come up with
> a design and implementation that avoid any concerns raised earlier.
> Learnings from the Greenplum Database column store were also leveraged
> while designing and implementing the same.
>
> Credit: The design is mostly the brainchild of Heikki, or actually his
> epiphany, to be exact. I acted as an idea bouncing board and
> contributed enhancements to the same. We are both having a lot of fun
> writing the code for this.
>
>
> References
> 1] https://github.com/greenplum-db/postgres/tree/zedstore
> 2]
> https://www.postgresql.org/message-id/flat/CAJrrPGfaC7WC9NK6PTTy6YN-NN%2BhCy8xOLAh2doYhVg5d6HsAA%40mail.gmail.com
> 3]
> https://www.postgresql.org/message-id/flat/20150611230316.GM133018%40postgresql.org
> 4]
> https://www.postgresql.org/message-id/flat/20150831225328.GM2912%40alvherre.pgsql
> 5] https://github.com/citusdata/cstore_fdw
> 6]
> https://www.postgresql.org/message-id/flat/CAOykqKfko-n5YiBJtk-ocVdp%2Bj92Apu5MJBwbGGh4awRY5NCuQ%40mail.gmail.com
> 7]
> https://www.postgresql.org/message-id/d0fc97bd-7ec8-2388-e4a6-0fda86d71a43%40iki.fi
>
>
Reading about it reminds me of this work on TAG column storage (
http://www09.sigmod.org/sigmod/record/issues/0703/03.article-graefe.pdf ).
Isn't this storage system inspired by that, with the TID as the TAG?

It is not referenced here, which made me wonder.
-- 
Regards,
Rafia Sabih


Re: explain plans with information about (modified) gucs

2019-04-01 Thread Rafia Sabih
On Fri, 29 Mar 2019 at 22:07, Tomas Vondra 
wrote:

> On Wed, Mar 27, 2019 at 09:06:04AM +0100, Rafia Sabih wrote:
> >On Tue, 26 Mar 2019 at 21:04, Tomas Vondra 
> >wrote:
> >
> >> On Mon, Mar 18, 2019 at 11:31:48AM +0100, Rafia Sabih wrote:
> >> >On Sun, 24 Feb 2019 at 00:06, Tomas Vondra <
> tomas.von...@2ndquadrant.com>
> >> wrote:
> >> >>
> >> >> Hi,
> >> >>
> >> >> attached is an updated patch, fixing and slightly tweaking the docs.
> >> >>
> >> >>
> >> >> Barring objections, I'll get this committed later next week.
> >> >>
> >> >I was having a look at this patch, and this kept me wondering,
> >> >
> >> >+static void
> >> >+ExplainShowSettings(ExplainState *es)
> >> >+{
> >> >Is there some reason for not providing any explanation above this
> >> >function just like the rest of the functions in this file?
> >> >
> >> >Similarly, for
> >> >
> >> >struct config_generic **
> >> >get_explain_guc_options(int *num)
> >> >{
> >> >
> >> >/* also bail out of there are no options */
> >> >+ if (!num)
> >> >+ return;
> >> >I think you meant 'if' instead of 'of' here.
> >>
> >> Thanks for the review - attached is a patch adding the missing comments,
> >> and doing two additional minor improvements:
> >>
> >> 1) Renaming ExplainShowSettings to ExplainPrintSettings, to make it more
> >> consistent with naming of the other functions in explain.c.
> >>
> >> 2) Actually counting GUC_EXPLAIN options, and only allocating space for
> >> those in get_explain_guc_options, instead of using num_guc_variables.
> >> The difference is quite significant (~50 vs. ~300), and considering
> >> each entry is 8B it makes a difference because such large chunks tend
> >> to have higher palloc overhead (due to ALLOCSET_SEPARATE_THRESHOLD).
> >>
> >>
> > Looks like the patch is in need of a rebase.
> >At commit: 1983af8e899389187026cb34c1ca9d89ea986120
> >
> >P.S. reject files attached.
> >
>
> D'oh! That was a stupid mistake - I apparently attached just the delta
> against the previous patch version, i.e. the improvements I described.
> Attached is a correct (and complete) patch.
>
> I planned to get this committed today, but considering this I'll wait
> until early next week to allow for feedback.


The patch looks good to me.

-- 
Regards,
Rafia Sabih


Re: explain plans with information about (modified) gucs

2019-03-27 Thread Rafia Sabih
On Tue, 26 Mar 2019 at 21:04, Tomas Vondra 
wrote:

> On Mon, Mar 18, 2019 at 11:31:48AM +0100, Rafia Sabih wrote:
> >On Sun, 24 Feb 2019 at 00:06, Tomas Vondra 
> wrote:
> >>
> >> Hi,
> >>
> >> attached is an updated patch, fixing and slightly tweaking the docs.
> >>
> >>
> >> Barring objections, I'll get this committed later next week.
> >>
> >I was having a look at this patch, and this kept me wondering,
> >
> >+static void
> >+ExplainShowSettings(ExplainState *es)
> >+{
> >Is there some reason for not providing any explanation above this
> >function just like the rest of the functions in this file?
> >
> >Similarly, for
> >
> >struct config_generic **
> >get_explain_guc_options(int *num)
> >{
> >
> >/* also bail out of there are no options */
> >+ if (!num)
> >+ return;
> >I think you meant 'if' instead of 'of' here.
>
> Thanks for the review - attached is a patch adding the missing comments,
> and doing two additional minor improvements:
>
> 1) Renaming ExplainShowSettings to ExplainPrintSettings, to make it more
> consistent with naming of the other functions in explain.c.
>
> 2) Actually counting GUC_EXPLAIN options, and only allocating space for
> those in get_explain_guc_options, instead of using num_guc_variables. The
> difference is quite significant (~50 vs. ~300), and considering each entry
> is 8B it makes a difference because such large chunks tend to have higher
> palloc overhead (due to ALLOCSET_SEPARATE_THRESHOLD).
>
>
Looks like the patch is in need of a rebase.
At commit: 1983af8e899389187026cb34c1ca9d89ea986120

P.S. reject files attached.


-- 
Regards,
Rafia Sabih
--- src/backend/utils/misc/guc.c
+++ src/backend/utils/misc/guc.c
@@ -8837,6 +8855,11 @@ ShowAllGUCConfig(DestReceiver *dest)
 	end_tup_output(tstate);
 }
 
+/*
+ * Returns an array of modified GUC options to show in EXPLAIN. Only options
+ * related to query planning (marked with GUC_EXPLAIN), with values different
+ * from built-in defaults.
+ */
 struct config_generic **
 get_explain_guc_options(int *num)
 {
@@ -8844,7 +8867,13 @@ get_explain_guc_options(int *num)
 	struct config_generic **result;
 
 	*num = 0;
-	result = palloc(sizeof(struct config_generic *) * num_guc_variables);
+
+	/*
+	 * Allocate enough space to fit all GUC_EXPLAIN options. We may not
+	 * need all the space, but there are fairly few such options so we
+	 * don't waste a lot of memory.
+	 */
+	result = palloc(sizeof(struct config_generic *) * num_guc_explain_variables);
 
 	for (i = 0; i < num_guc_variables; i++)
 	{
@@ -8912,6 +8941,8 @@ get_explain_guc_options(int *num)
 		/* assign to the values array */
 		result[*num] = conf;
 		*num = *num + 1;
+
+		Assert(*num <= num_guc_explain_variables);
 	}
 
 	return result;
--- src/backend/commands/explain.c
+++ src/backend/commands/explain.c
@@ -599,8 +599,12 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
 	ExplainCloseGroup("Query", NULL, true, es);
 }
 
+/*
+ * ExplainPrintSettings -
+ *Print summary of modified settings affecting query planning.
+ */
 static void
-ExplainShowSettings(ExplainState *es)
+ExplainPrintSettings(ExplainState *es)
 {
 	int		num;
 	struct config_generic **gucs;
@@ -700,10 +704,10 @@ ExplainPrintPlan(ExplainState *es, QueryDesc *queryDesc)
 	ExplainNode(ps, NIL, NULL, NULL, es);
 
 	/*
-	 * If requested, include information about GUC parameters that don't
-	 * match the built-in defaults.
+	 * If requested, include information about GUC parameters with values
+	 * that don't match the built-in defaults.
 	 */
-	ExplainShowSettings(es);
+	ExplainPrintSettings(es);
 }
 
 /*
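

As a usage sketch of the feature under review (the option name follows
what was eventually committed, so details may differ from this patch
version):

SET enable_nestloop = off;
SET work_mem = '256MB';
EXPLAIN (SETTINGS) SELECT 1;
-- after the plan, a line like the following is printed:
-- Settings: enable_nestloop = 'off', work_mem = '256MB'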


Re: [HACKERS] CLUSTER command progress monitor

2019-03-18 Thread Rafia Sabih
On Fri, 8 Mar 2019 at 09:14, Tatsuro Yamada
 wrote:
>
> On 2019/03/06 15:38, Tatsuro Yamada wrote:
> > On 2019/03/05 17:56, Tatsuro Yamada wrote:
> >> On 2019/03/05 11:35, Robert Haas wrote:
> >>> On Mon, Mar 4, 2019 at 5:38 AM Tatsuro Yamada
> >>>  wrote:
> >>>> === Current design ===
> >>>>
> >>>> CLUSTER command uses Index Scan or Seq Scan when scanning the heap.
> >>>> Depending on which one is chosen, the command will proceed in the
> >>>> following sequence of phases:
> >>>>
> >>>> * Scan method: Seq Scan
> >>>>   0. initializing (*2)
> >>>>   1. seq scanning heap(*1)
> >>>>   3. sorting tuples   (*2)
> >>>>   4. writing new heap (*1)
> >>>>   5. swapping relation files  (*2)
> >>>>   6. rebuilding index (*2)
> >>>>   7. performing final cleanup (*2)
> >>>>
> >>>> * Scan method: Index Scan
> >>>>   0. initializing (*2)
> >>>>   2. index scanning heap  (*1)
> >>>>   5. swapping relation files  (*2)
> >>>>   6. rebuilding index (*2)
> >>>>   7. performing final cleanup (*2)
> >>>>
> >>>> VACUUM FULL command will proceed in the following sequence of phases:
> >>>>
> >>>>   1. seq scanning heap(*1)
> >>>>   5. swapping relation files  (*2)
> >>>>   6. rebuilding index (*2)
> >>>>   7. performing final cleanup (*2)
> >>>>
> >>>> (*1): increasing the value in heap_tuples_scanned column
> >>>> (*2): only shows the phase in the phase column
> >>>
> >>> All of that sounds good.
> >>>
> >>>> The view provides the details of CLUSTER command progress as
> >>>> follows:
> >>>> # \d pg_stat_progress_cluster
> >>>> View "pg_catalog.pg_stat_progress_cluster"
> >>>> Column   |  Type   | Collation | Nullable | Default
> >>>> ---+-+---+--+-
> >>>>pid   | integer |   |  |
> >>>>datid | oid |   |  |
> >>>>datname   | name|   |  |
> >>>>relid | oid |   |  |
> >>>>command   | text|   |  |
> >>>>phase | text|   |  |
> >>>>cluster_index_relid   | bigint  |   |  |
> >>>>heap_tuples_scanned   | bigint  |   |  |
> >>>>heap_tuples_vacuumed  | bigint  |   |  |
> >>>
> >>> Still not sure if we need heap_tuples_vacuumed.  We could try to
> >>> report heap_blks_scanned and heap_blks_total like we do for VACUUM, if
> >>> we're using a Seq Scan.
> >>
> >> I have no strong opinion on adding heap_tuples_vacuumed, so I'll
> >> remove that in the next patch.
> >>
> >> Regarding heap_blks_scanned and heap_blks_total, I suppose it is
> >> possible to get those from initscan(). I'll investigate it more.
> >>
> >> cluster.c
> >>copy_heap_data()
> >>  heap_beginscan()
> >>heap_beginscan_internal()
> >>  initscan()
> >>
> >>
> >>
> >>>> === Discussion points ===
> >>>>
> >>>>- Progress counter for "3. sorting tuples" phase
> >>>>   - Should we add pgstat_progress_update_param() in tuplesort.c like 
> >>>> a
> >>>> "trace_sort"?
> >>>> Thanks to Peter Geoghegan for the useful advice!
> >>>
> >>> How would we avoid an abstraction violation?
> >>
> >> Hmm... What do you mean by an abstraction violation?
> >> If it is difficult to solve, I'd rather not add the progress counter
> >> for sorting tuples.
> >>
> >>
> >>>>- Progress counter for "6. rebuilding index"

Re: explain plans with information about (modified) gucs

2019-03-18 Thread Rafia Sabih
On Sun, 24 Feb 2019 at 00:06, Tomas Vondra  wrote:
>
> Hi,
>
> attached is an updated patch, fixing and slightly tweaking the docs.
>
>
> Barring objections, I'll get this committed later next week.
>
I was having a look at this patch, and this kept me wondering,

+static void
+ExplainShowSettings(ExplainState *es)
+{
Is there some reason for not providing any explanation above this
function just like the rest of the functions in this file?

Similarly, for

struct config_generic **
get_explain_guc_options(int *num)
{

/* also bail out of there are no options */
+ if (!num)
+ return;
I think you meant 'if' instead of 'of' here.



Re: Compressed TOAST Slicing

2018-12-02 Thread Rafia Sabih
On Fri, Nov 2, 2018 at 11:55 PM Paul Ramsey  wrote:
>
> As threatened, I have also added a patch to left() to also use sliced access.

Hi Paul,

The idea looks good, and going by your performance evaluation it seems
like a practical one too.

I had a look at this patch and here are my initial comments,
1.
- if (dp != destend || sp != srcend)
+ if (!is_slice && (dp != destend || sp != srcend))
  return -1;

A comment explaining how this check differs for the is_slice case would
be helpful (see the sketch after these comments).
2.
- int len = VARSIZE_ANY_EXHDR(str);
- int n = PG_GETARG_INT32(1);
- int rlen;
+ int n = PG_GETARG_INT32(1);

Looks like PG indentation is not followed here for n.
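
To be concrete about point 1, something along these lines is what I had
in mind (the wording is only a suggestion):

+ /*
+  * A sliced decompression may legitimately stop before the whole
+  * compressed source has been consumed, so only a full (non-slice)
+  * decompression insists that both source and destination were fully
+  * consumed.
+  */
+ if (!is_slice && (dp != destend || sp != srcend))
+     return -1;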

-- 
Regards,
Rafia Sabih
EnterpriseDB: http://www.enterprisedb.com/



Re: Latest HEAD fails to build

2018-09-10 Thread Rafia Sabih
On Mon, Sep 10, 2018 at 4:10 PM, Christoph Berg  wrote:

> Re: Rafia Sabih 2018-09-10  8hfpxsrr6mxab54ebju...@mail.gmail.com>
> > Thanks for the interest and help you offered. But I didn't quite get
> > it, I tried maintainer-clean and distclean but it didn't work.
>
> git clean -xdf
>

Thanks Christoph, it worked.

-- 
Regards,
Rafia Sabih
EnterpriseDB: http://www.enterprisedb.com/


Re: Latest HEAD fails to build

2018-09-10 Thread Rafia Sabih
On Mon, Sep 10, 2018 at 2:56 PM, Daniel Gustafsson  wrote:

> > On 10 Sep 2018, at 11:22, Rafia Sabih 
> wrote:
> >
> > Hi all,
> >
> > After pulling the latest commit --
> > e3d77ea6b4e425093db23be492f236896dd7b501, I am getting the following
> > error when compiling:
> > cp: cannot stat ‘./dynloader.h’: No such file or directory
> >
> > Things were working fine till ac27c74def5d8544530b13d5901308a342f072ac
> > at least.
> >
> > Anybody having a clue about that…
>
> I ran into the same thing this morning.  I think you have a leftover build
> artefact in src/include/dynloader.h from a build prior to its removal.
> Try to remove the file and make install again.
>
Hi Daniel,

Thanks for the interest and help you offered. But I didn't quite get it; I
tried maintainer-clean and distclean but it didn't work.

-- 
Regards,
Rafia Sabih
EnterpriseDB: http://www.enterprisedb.com/


Latest HEAD fails to build

2018-09-10 Thread Rafia Sabih
Hi all,

After pulling the latest commit --
e3d77ea6b4e425093db23be492f236896dd7b501, I am getting the following error
when compiling:
cp: cannot stat ‘./dynloader.h’: No such file or directory

Things were working fine till ac27c74def5d8544530b13d5901308a342f072ac
at least.

Anybody having a clue about that...

-- 
Regards,
Rafia Sabih
EnterpriseDB: http://www.enterprisedb.com/


Re: Hint to set owner for tablespace directory

2018-09-02 Thread Rafia Sabih
On Fri, Aug 31, 2018 at 4:11 PM, Maksim Milyutin 
wrote:

> On 08/31/2018 01:12 PM, Rafia Sabih wrote:
>
>
>
> On Fri, Aug 31, 2018 at 3:30 PM, Maksim Milyutin 
> wrote:
>
>> On 08/31/2018 11:55 AM, Rafia Sabih wrote:
>>
>>
>> Adding to that, this patch needs a rebase. And please don't forget to
>> run 'git diff --check', as the patch has some whitespace issues.
>>
>>
>>  git apply /home/edb/Desktop/patches/others/owner_tbsp_hint.patch
>  /home/edb/Desktop/patches/others/owner_tbsp_hint.patch:9: trailing
> whitespace.
> {
> /home/edb/Desktop/patches/others/owner_tbsp_hint.patch:10: trailing
> whitespace.
> bool wrong_owner = (errno == EPERM);
>
>
> This is hex+ASCII display (from *hexdump -C owner_tbsp_hint.patch* output)
> of code fragment above:
>
> 01a0  0a 20 09 09 65 6c 73 65  0a 2b 09 09 7b 0a 2b 09  |.
> ..else.+..{.+.|
> 01b0  09 09 62 6f 6f 6c 20 77  72 6f 6e 67 5f 6f 77 6e  |..bool
> wrong_own|
> 01c0  65 72 20 3d 20 28 65 72  72 6e 6f 20 3d 3d 20 45  |er = (errno
> == E|
> 01d0  50 45 52 4d 29 3b 0a 2b  0a 20 09 09 09 65 72 65  |PERM);.+.
> ...ere|
>
> Problem lines don't have trailing whitespaces, only line feed (0x0a)
> symbols
>
> This is what I got. Please correct me if I am missing anything. I am
> using CentOS 7, just an FYI.
>
>
> My git version is 2.7.4.
>
I am using version 2.12.1; that might be the reason for the discrepancy.

-- 
Regards,
Rafia Sabih
EnterpriseDB: http://www.enterprisedb.com/


Re: Hint to set owner for tablespace directory

2018-08-31 Thread Rafia Sabih
On Fri, Aug 31, 2018 at 3:30 PM, Maksim Milyutin 
wrote:

> On 08/31/2018 11:55 AM, Rafia Sabih wrote:
>
>
> Adding to that, this patch needs a rebase. And please don't forget to
> run 'git diff --check', as the patch has some whitespace issues.
>
>
>  git apply /home/edb/Desktop/patches/others/owner_tbsp_hint.patch
 /home/edb/Desktop/patches/others/owner_tbsp_hint.patch:9: trailing
whitespace.
{
/home/edb/Desktop/patches/others/owner_tbsp_hint.patch:10: trailing
whitespace.
bool wrong_owner = (errno == EPERM);
/home/edb/Desktop/patches/others/owner_tbsp_hint.patch:11: trailing
whitespace.

/home/edb/Desktop/patches/others/owner_tbsp_hint.patch:16: trailing
whitespace.
location),
/home/edb/Desktop/patches/others/owner_tbsp_hint.patch:17: trailing
whitespace.
 wrong_owner ? errhint("Install the PostgreSQL system
user as "
error: patch failed: src/backend/commands/tablespace.c:585
error: src/backend/commands/tablespace.c: patch does not apply

This is what I got. Please correct me if I am missing anything. I am using
CentOS 7, just an FYI.
--
Regards,
Rafia Sabih
EnterpriseDB: http://www.enterprisedb.com/


Re: Hint to set owner for tablespace directory

2018-08-31 Thread Rafia Sabih
On Fri, Aug 24, 2018 at 3:05 PM, Maksim Milyutin 
wrote:

> On 08/24/2018 05:18 AM, Michael Paquier wrote:
>
> On Thu, Aug 23, 2018 at 02:24:25PM +0300, Maksim Milyutin wrote:
>>
>>> I want to add a patch that prints a hint to set the required owner for
>>> the tablespace directory if this is the cause of the problem (*errno ==
>>> EPERM* after calling *chmod*).
>>>
>> Please do not forget to add this patch to the next commit fest.
>>
>
> Thanks, done.


Adding to that, this patch needs a rebase. And please don't forget to run
'git diff --check', as the patch has some whitespace issues.

-- 
Regards,
Rafia Sabih
EnterpriseDB: http://www.enterprisedb.com/


Re: [HACKERS] Partition-wise aggregation/grouping

2018-02-13 Thread Rafia Sabih
On Tue, Feb 13, 2018 at 6:21 PM, Jeevan Chalke <
jeevan.cha...@enterprisedb.com> wrote:

>
> I see that partition-wise aggregate plan too uses parallel index, am I
> missing something?
>
>
You're right, I missed that, oops.

>
>> Q18 takes some 390 secs with patch and some 147 secs without it.
>>
>
> This looks strange. This patch set does not touch parallel or seq scan as
> such. I am not sure why this is happening. All these three queries explain
> plan shows much higher execution time for parallel/seq scan.
>
Yeah, strange it is.

> However, do you see similar behaviour with patches applied,
> "enable_partition_wise_agg = on" and "enable_partition_wise_agg = off" ?
>

I tried that for query 18: with the patch and enable_partition_wise_agg =
off, the query completes in some 270 secs. You may find the explain
analyse output for it in the attached file. I noticed that on head the
query plan had a parallel hash join; however, with the patch and no
partition-wise aggregation it uses nested loop joins. This might be the
issue.

>
> Also, does rest of the queries perform better with partition-wise
> aggregates?
>
>
As far as this setting goes, there wasn't any other query using
partition-wise aggregation, so, no.

BTW, just an FYI, this experiment is on scale factor 20.

-- 
Regards,
Rafia Sabih
EnterpriseDB: http://www.enterprisedb.com/


18_pwa_off.out
Description: Binary data


Re: Shouldn't execParallel.c null-terminate query_string in the parallel DSM?

2017-12-19 Thread Rafia Sabih
On Wed, Dec 20, 2017 at 7:58 AM, Thomas Munro <thomas.mu...@enterprisedb.com
> wrote:

> Hi hackers,
>
> I just saw some trailing garbage in my log file emanating from a
> parallel worker when my query happened to be a BUFFERALIGNed length
> (in this case 64 characters).  Did commit 4c728f382970 forget to make
> sure the null terminator is copied?  See attached proposed fix.
>
Yes, I missed that.
Your patch looks good to me, thanks.
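
For the archives, the substance of the fix is just to account for the
terminator when sizing and copying the query text into the DSM; roughly
(a sketch of the shape of the code in ExecInitParallelPlan(), not the
exact committed hunk):

	/* Estimate and allocate space including the trailing '\0'. */
	query_len = strlen(estate->es_sourceText) + 1;
	query_string = shm_toc_allocate(pcxt->toc, query_len);
	memcpy(query_string, estate->es_sourceText, query_len);
	shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, query_string);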

-- 
Regards,
Rafia Sabih
EnterpriseDB: http://www.enterprisedb.com/


Re: [HACKERS] [POC] Faster processing at Gather node

2017-11-23 Thread Rafia Sabih
On Thu, Nov 16, 2017 at 12:24 AM, Andres Freund <and...@anarazel.de> wrote:
> Hi,
>
> On 2017-11-15 13:48:18 -0500, Robert Haas wrote:
>> I think that we need a little bit deeper analysis here to draw any
>> firm conclusions.
>
> Indeed.
>
>
>> I suspect that one factor is that many of the queries actually send
>> very few rows through the Gather.
>
> Yep. I kinda wonder if the same result would present itself if the
> benchmarks were run with parallel_leader_participation. The theory being
> that what we're seeing is just that the leader doesn't accept any tuples,
> and the large queue size just helps because workers can run for longer.
>
I ran Q12 with parallel_leader_participation = off and could not get
any performance improvement with the patches given by Robert. The
result was the same for head as well. The query plan also remains
unaffected by the value of this parameter.

Here are the details of the experiment,
TPC-H scale factor = 20,
work_mem = 1GB
random_page_cost = seq_page_cost = 0.1
max_parallel_workers_per_gather = 4

PG commit: 745948422c799c1b9f976ee30f21a7aac050e0f3
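
In SET form, for anyone wanting to re-run this (a convenience restatement
of the settings above):

SET work_mem = '1GB';
SET random_page_cost = 0.1;
SET seq_page_cost = 0.1;
SET max_parallel_workers_per_gather = 4;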

Please find the attached file for the explain analyse output for
either value of parallel_leader_participation and the patches.
-- 
Regards,
Rafia Sabih
EnterpriseDB: http://www.enterprisedb.com/
with parallel_leader_participation = 1;

 QUERY PLAN 
 
 
--
-
 Limit  (cost=1001.19..504469.79 rows=1 width=27) (actual 
time=21833.206..21833.207 rows=1 loops=1)
   ->  GroupAggregate  (cost=1001.19..3525281.42 rows=7 width=27) (actual 
time=21833.203..21833.203 rows=1 loops=1)
 Group Key: lineitem.l_shipmode
 ->  Gather Merge  (cost=1001.19..3515185.81 rows=576888 width=27) 
(actual time=4.388..21590.757 rows=311095 loops=1)
   Workers Planned: 4
   Workers Launched: 4
   ->  Nested Loop  (cost=1.13..3445472.82 rows=144222 width=27) 
(actual time=0.150..4399.384 rows=62247 loops=5)
 ->  Parallel Index Scan using l_shipmode on lineitem  
(cost=0.57..3337659.69 rows=144222 width=19) (actual time=0.111..3772.865 
rows=62247 loops=5)
   Index Cond: (l_shipmode = ANY ('{"REG 
AIR",RAIL}'::bpchar[]))
   Filter: ((l_commitdate < l_receiptdate) AND 
(l_shipdate < l_commitdate) AND (l_receiptdate >= '1995-01-01'::date) AND 
(l_receiptdate < '1996-01-01 00:00:00'::timestamp without
 time zone))
   Rows Removed by Filter: 3367603
 ->  Index Scan using orders_pkey on orders  
(cost=0.56..0.75 rows=1 width=20) (actual time=0.009..0.009 rows=1 loops=311236)
   Index Cond: (o_orderkey = lineitem.l_orderkey)
 Planning time: 0.526 ms
 Execution time: 21835.922 ms
(15 rows)

postgres=# set parallel_leader_participation =0;
SET
postgres=# \i /data/rafia.sabih/dss/queries/12.sql 

  QUERY PLAN
  
 
--
-
 Limit  (cost=1001.19..504469.79 rows=1 width=27) (actual 
time=21179.065..21179.066 rows=1 loops=1)
   ->  GroupAggregate  (cost=1001.19..3525281.42 rows=7 width=27) (actual 
time=21179.064..21179.064 rows=1 loops=1)
 Group Key: lineitem.l_shipmode
 ->  Gather Merge  (cost=1001.19..3515185.81 rows=576888 width=27) 
(actual time=4.201..20941.385 rows=311095 loops=1)
   Workers Planned: 4
   Workers Launched: 4
   ->  Nested Loop  (cost=1.13..3445472.82 rows=144222 width=27) 
(actual time=0.187..5105.780 rows=77797 loops=4)
 ->  Parallel Index Scan using l_shipmode on lineitem  
(cost=0.57..3337659.69 rows=144222 width=19) (actual time=0.149..4362.235 
rows=77797 loops=4)
   Index Cond: (l_shipmode = ANY ('{"REG 
AIR",RAIL}'::bpchar[]))
   Filter: ((l_commitdate < l_receiptdate) AND 
(l_shipdate < l_commitdate) AND (l_receiptdate >= '1995-01-01'::date) AND 
(l_receiptdate < '1996-01-01 00:00:00'::timestamp without
 time zone))
   Rows Removed by Filter: 4208802
 ->  Index Scan