Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-07 Thread hubert depesz lubaczewski
On Tue, Sep 06, 2011 at 09:21:02PM -0400, Bruce Momjian wrote:
 Tom Lane wrote:
  hubert depesz lubaczewski dep...@depesz.com writes:
   Worked a bit to get the ltree problem down to smallest possible, 
   repeatable, situation.
  
  I looked at this again and verified that indeed, commit
  8eee65c996048848c20f6637c1d12b319a4ce244 introduced an incompatible
  change into the on-disk format of ltree columns: it widened
  ltree_level.len, which is one component of an ltree on disk.
  So the crash is hardly surprising.  I think that the only thing
  pg_upgrade could do about it is refuse to upgrade when ltree columns
  are present in an 8.3 database.  I'm not sure though how you'd identify
  contrib/ltree versus some random user-defined type named ltree.
 
 It is actually easy to do using the attached patch.  I check for the
 functions that support the data type and check of they are from an
 'ltree' shared object.  I don't check actual user table type names in
 this case.

While it will prevent failures in future, it doesn't solve my problem
now :(

Will try to do it via:
- drop indexes on ltree
- convert ltree to text
- upgrade
- convert text to ltree
- create indexes on ltree

Best regards,

depesz


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-07 Thread Bruce Momjian
Bruce Momjian wrote:
 Tom Lane wrote:
  hubert depesz lubaczewski dep...@depesz.com writes:
   Worked a bit to get the ltree problem down to smallest possible, 
   repeatable, situation.
  
  I looked at this again and verified that indeed, commit
  8eee65c996048848c20f6637c1d12b319a4ce244 introduced an incompatible
  change into the on-disk format of ltree columns: it widened
  ltree_level.len, which is one component of an ltree on disk.
  So the crash is hardly surprising.  I think that the only thing
  pg_upgrade could do about it is refuse to upgrade when ltree columns
  are present in an 8.3 database.  I'm not sure though how you'd identify
  contrib/ltree versus some random user-defined type named ltree.
 
 It is actually easy to do using the attached patch.  I check for the
 functions that support the data type and check of they are from an
 'ltree' shared object.  I don't check actual user table type names in
 this case.

Attached patch applied to 9.0, 9.1, and HEAD.  Doc changes included.

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +
diff --git a/contrib/pg_upgrade/check.c b/contrib/pg_upgrade/check.c
new file mode 100644
index 37c38c1..720f130
*** a/contrib/pg_upgrade/check.c
--- b/contrib/pg_upgrade/check.c
*** check_old_cluster(migratorContext *ctx, 
*** 72,77 
--- 72,78 
  	{
  		old_8_3_check_for_name_data_type_usage(ctx, CLUSTER_OLD);
  		old_8_3_check_for_tsquery_usage(ctx, CLUSTER_OLD);
+ 		old_8_3_check_ltree_usage(ctx, CLUSTER_OLD);
  		if (ctx-check)
  		{
  			old_8_3_rebuild_tsvector_tables(ctx, true, CLUSTER_OLD);
diff --git a/contrib/pg_upgrade/pg_upgrade.h b/contrib/pg_upgrade/pg_upgrade.h
new file mode 100644
index 41c4b11..7a02fa1
*** a/contrib/pg_upgrade/pg_upgrade.h
--- b/contrib/pg_upgrade/pg_upgrade.h
*** void old_8_3_check_for_name_data_type_us
*** 394,399 
--- 394,401 
  	   Cluster whichCluster);
  void old_8_3_check_for_tsquery_usage(migratorContext *ctx,
  Cluster whichCluster);
+ void old_8_3_check_ltree_usage(migratorContext *ctx,
+ Cluster whichCluster);
  void old_8_3_rebuild_tsvector_tables(migratorContext *ctx,
  bool check_mode, Cluster whichCluster);
  void old_8_3_invalidate_hash_gin_indexes(migratorContext *ctx,
diff --git a/contrib/pg_upgrade/version_old_8_3.c b/contrib/pg_upgrade/version_old_8_3.c
new file mode 100644
index 6fcd61b..7e3a7aa
*** a/contrib/pg_upgrade/version_old_8_3.c
--- b/contrib/pg_upgrade/version_old_8_3.c
*** old_8_3_check_for_tsquery_usage(migrator
*** 204,209 
--- 204,289 
  
  
  /*
+  *	old_8_3_check_ltree_usage()
+  *	8.3 - 8.4
+  *	The internal ltree structure was changed in 8.4 so upgrading is impossible.
+  */
+ void
+ old_8_3_check_ltree_usage(migratorContext *ctx, Cluster whichCluster)
+ {
+ 	ClusterInfo *active_cluster = (whichCluster == CLUSTER_OLD) ?
+ 	ctx-old : ctx-new;
+ 	int			dbnum;
+ 	FILE	   *script = NULL;
+ 	bool		found = false;
+ 	char		output_path[MAXPGPATH];
+ 
+ 	prep_status(ctx, Checking for /contrib/ltree);
+ 
+ 	snprintf(output_path, sizeof(output_path), %s/contrib_ltree.txt,
+ 			 ctx-cwd);
+ 
+ 	for (dbnum = 0; dbnum  active_cluster-dbarr.ndbs; dbnum++)
+ 	{
+ 		PGresult   *res;
+ 		bool		db_used = false;
+ 		int			ntups;
+ 		int			rowno;
+ 		int			i_nspname,
+ 	i_proname;
+ 		DbInfo	   *active_db = active_cluster-dbarr.dbs[dbnum];
+ 		PGconn	   *conn = connectToServer(ctx, active_db-db_name, whichCluster);
+ 
+ 		/* Find any functions coming from contrib/ltree */
+ 		res = executeQueryOrDie(ctx, conn,
+ SELECT n.nspname, p.proname 
+ FROM	pg_catalog.pg_proc p, 
+ 		pg_catalog.pg_namespace n 
+ WHERE	p.pronamespace = n.oid AND 
+ 		p.probin = '$libdir/ltree');
+ 
+ 		ntups = PQntuples(res);
+ 		i_nspname = PQfnumber(res, nspname);
+ 		i_proname = PQfnumber(res, proname);
+ 		for (rowno = 0; rowno  ntups; rowno++)
+ 		{
+ 			found = true;
+ 			if (script == NULL  (script = fopen(output_path, w)) == NULL)
+ pg_log(ctx, PG_FATAL, Could not create necessary file:  %s\n, output_path);
+ 			if (!db_used)
+ 			{
+ fprintf(script, Database:  %s\n, active_db-db_name);
+ db_used = true;
+ 			}
+ 			fprintf(script,   %s.%s\n,
+ 	PQgetvalue(res, rowno, i_nspname),
+ 	PQgetvalue(res, rowno, i_proname));
+ 		}
+ 
+ 		PQclear(res);
+ 
+ 		PQfinish(conn);
+ 	}
+ 
+ 	if (found)
+ 	{
+ 		fclose(script);
+ 		pg_log(ctx, PG_REPORT, fatal\n);
+ 		pg_log(ctx, PG_FATAL,
+ 			   | Your installation contains the \ltree\ data type.  This data type\n
+ 			   | changed its internal storage format between your old and new clusters so this\n
+ 			   | cluster cannot currently be upgraded.  You can manually upgrade databases\n
+ 			   | that use \contrib/ltree\ facilities and remove \contrib/ltree\ from the old\n
+ 			   | cluster and restart the 

Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-06 Thread Peter Eisentraut
On mån, 2011-09-05 at 16:53 -0400, Bruce Momjian wrote:
 hubert depesz lubaczewski wrote:
   Good.  Is it possible to compile with debug symbols, -g?  Odd you are
   crashing in libc.
  
  this had debug:
  
  ./configure \
  --prefix=/opt/pgsql-9.0.5a-int \
  --enable-debug \
  --disable-rpath \
  --without-perl \
  --without-python \
  --without-tcl \
  --without-openssl \
  --without-pam \
  --without-krb5 \
  --without-gssapi \
  --enable-nls \
  --enable-integer-datetimes \
  --enable-thread-safety \
  --with-libxml \
  --with-libxslt \
  --without-ldap
 
 --enable-debug adds internal debug calls, not compiler debug symbols.

No, you have that backwards.


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-06 Thread hubert depesz lubaczewski
On Mon, Sep 05, 2011 at 05:26:00PM -0400, Tom Lane wrote:
 Bruce Momjian br...@momjian.us writes:
  Odd it is dying in the memory freeing at executor close --- not in the
  ltree code.
 
 Doesn't seem odd.  The glibc complaint previously shown already
 indicates this is a memory stomp problem.
 
 --enable-cassert might (or might not) provide additional help.

recompiled with cassert.

result:

=# select * from categories where category_id = 177;
The connection to the server was lost. Attempting reset: Succeeded.

which is interesting, as the error is different.

logs show:

2011-09-06 10:28:58 UTC () [21986]: [1-1] user=[unknown],db=[unknown] LOG:  
connection received: host=[local]
2011-09-06 10:28:58 UTC ([local]) [21986]: [2-1] user=postgres,db=xxx LOG:  
connection authorized: user=postgres database=xxx
2011-09-06 10:28:58 UTC () [21977]: [2-1] user=,db= LOG:  server process (PID 
21985) was terminated by signal 11: Segmentation fault
2011-09-06 10:28:58 UTC () [21977]: [3-1] user=,db= LOG:  terminating any other 
active server processes
2011-09-06 10:28:58 UTC ([local]) [21986]: [3-1] user=postgres,db=xxx 
WARNING:  terminating connection because of crash of another server process
2011-09-06 10:28:58 UTC ([local]) [21986]: [4-1] user=postgres,db=xxx 
DETAIL:  The postmaster has commanded this server process to roll back the 
current transaction and exit, because another server process exited abnormally 
and possibly corrupted shared memory.
2011-09-06 10:28:58 UTC ([local]) [21986]: [5-1] user=postgres,db=xxx HINT: 
 In a moment you should be able to reconnect to the database and repeat your 
command.

gdb backtrace is even less helpful:

$ gdb -q -c core* /opt/pgsql-9.0.5a-int/bin/postgres

warning: Can't read pathname for load map: Input/output error.
Reading symbols from /usr/lib/libxml2.so.2...done.
Loaded symbols for /usr/lib/libxml2.so.2
Reading symbols from /lib/libdl.so.2...done.
Loaded symbols for /lib/libdl.so.2
Reading symbols from /lib/libm.so.6...done.
Loaded symbols for /lib/libm.so.6
Reading symbols from /lib/libc.so.6...done.
Loaded symbols for /lib/libc.so.6
Reading symbols from /usr/lib/libz.so.1...done.
Loaded symbols for /usr/lib/libz.so.1
Reading symbols from /lib/ld-linux-x86-64.so.2...done.
Loaded symbols for /lib64/ld-linux-x86-64.so.2
Reading symbols from /lib/libnss_files.so.2...done.
Loaded symbols for /lib/libnss_files.so.2
Core was generated by `postgres: postgres xxx [local] SELECT '.
Program terminated with signal 11, Segmentation fault.
[New process 21985]
#0  0x7fe18c235e4b in memcpy () from /lib/libc.so.6
(gdb) bt
#0  0x7fe18c235e4b in memcpy () from /lib/libc.so.6
#1  0x7fe1897532e4 in ?? ()
#2  0x in ?? ()
(gdb) 

Best regards,

depesz

-- 
The best thing about modern society is how easy it is to avoid contact with it.
 http://depesz.com/

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-06 Thread hubert depesz lubaczewski
Hi,

Worked a bit to get the ltree problem down to smallest possible, repeatable, 
situation.

Initial setup:

1. PostgreSQL 8.3.11, configured with:
./configure\
--prefix=/opt/pgsql-8.3.11-int \
--disable-rpath\
--without-perl \
--without-python   \
--without-tcl  \
--without-openssl  \
--without-pam  \
--without-krb5 \
--without-gssapi   \
--enable-nls   \
--enable-integer-datetimes \
--enable-thread-safety \
--with-libxml  \
--with-libxslt \
--without-ldap
Built and installed with contrib modules.

2. PostgreSQL 9.0.5 (pre), from git checkout, head of 9.0 branch. Configured 
with:
./configure \
--prefix=/opt/pgsql-9.0.5a-int \
--enable-debug \
--enable-cassert \
--disable-rpath \
--without-perl \
--without-python \
--without-tcl \
--without-openssl \
--without-pam \
--without-krb5 \
--without-gssapi \
--enable-nls \
--enable-integer-datetimes \
--enable-thread-safety \
--with-libxml \
--with-libxslt \
--without-ldap

Now with these two in place, I can make the test:

=$ mkdir /var/tmp/test
=$ cd /var/tmp/test/
=$ export LD_LIBRARY_PATH=/opt/pgsql-8.3.11-int/lib/
=$ export PATH=/opt/pgsql-8.3.11-int/bin:$PATH
=$ mkdir data-8.3
=$ initdb -D data-8.3
=$ pg_ctl -D data-8.3 -l logfile-8.3 start
=$ psql -d postgres -f /opt/pgsql-8.3.11-int/share/contrib/ltree.sql
=$ psql -d postgres -c create table z (x ltree)
=$ psql -d postgres -c insert into z (x) values ('a.b')
=$ pg_ctl -D data-8.3/ stop
=$ export LD_LIBRARY_PATH=/opt/pgsql-9.0.5a-int/lib/:/opt/pgsql-8.3.11-int/lib/
=$ export 
PATH=/opt/pgsql-9.0.5a-int/bin/:/opt/pgsql-8.3.11-int/bin/:~/bin:/usr/local/bin:/usr/bin:/bin
=$ mkdir data-9.0
=$ initdb -D data-9.0/
=$ perl -pi -e 's/#port = 5432/port=7654/' data-9.0/postgresql.conf
=$ pg_upgrade -v -b /opt/pgsql-8.3.11-int/bin/ -B /opt/pgsql-9.0.5a-int/bin/ -d 
$( pwd )/data-8.3/ -D $( pwd )/data-9.0 -k -l pg_upgrade.log -p 5432 -P 7654
=$ pg_ctl -D data-9.0 -l logfile-9.0 start
=$ psql -p 7654 -d postgres -c select * from z
WARNING:  detected write past chunk end in ExecutorState 0xc500a0
WARNING:  problem in alloc set ExecutorState: req size  alloc size for chunk 
0xc500d8 in block 0xc4dfd0
WARNING:  problem in alloc set ExecutorState: bad single-chunk 0xc500d8 in 
block 0xc4dfd0
WARNING:  problem in alloc set ExecutorState: bogus aset link in block 
0xc4dfd0, chunk 0xc500d8
WARNING:  problem in alloc set ExecutorState: found inconsistent memory block 
0xc4dfd0
connection to server was lost

Hope it helps.

Best regards,

depesz


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-06 Thread Tom Lane
hubert depesz lubaczewski dep...@depesz.com writes:
 Worked a bit to get the ltree problem down to smallest possible, repeatable, 
 situation.

I looked at this again and verified that indeed, commit
8eee65c996048848c20f6637c1d12b319a4ce244 introduced an incompatible
change into the on-disk format of ltree columns: it widened
ltree_level.len, which is one component of an ltree on disk.
So the crash is hardly surprising.  I think that the only thing
pg_upgrade could do about it is refuse to upgrade when ltree columns
are present in an 8.3 database.  I'm not sure though how you'd identify
contrib/ltree versus some random user-defined type named ltree.

regards, tom lane

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-06 Thread Bruce Momjian
Tom Lane wrote:
 hubert depesz lubaczewski dep...@depesz.com writes:
  Worked a bit to get the ltree problem down to smallest possible, 
  repeatable, situation.
 
 I looked at this again and verified that indeed, commit
 8eee65c996048848c20f6637c1d12b319a4ce244 introduced an incompatible
 change into the on-disk format of ltree columns: it widened
 ltree_level.len, which is one component of an ltree on disk.
 So the crash is hardly surprising.  I think that the only thing
 pg_upgrade could do about it is refuse to upgrade when ltree columns
 are present in an 8.3 database.  I'm not sure though how you'd identify
 contrib/ltree versus some random user-defined type named ltree.

It is actually easy to do using the attached patch.  I check for the
functions that support the data type and check of they are from an
'ltree' shared object.  I don't check actual user table type names in
this case.

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +
diff --git a/contrib/pg_upgrade/check.c b/contrib/pg_upgrade/check.c
new file mode 100644
index 93b9e69..91789d3
*** a/contrib/pg_upgrade/check.c
--- b/contrib/pg_upgrade/check.c
*** check_old_cluster(bool live_check, char 
*** 81,86 
--- 81,87 
  	{
  		old_8_3_check_for_name_data_type_usage(old_cluster);
  		old_8_3_check_for_tsquery_usage(old_cluster);
+ 		old_8_3_check_ltree(old_cluster);
  		if (user_opts.check)
  		{
  			old_8_3_rebuild_tsvector_tables(old_cluster, true);
diff --git a/contrib/pg_upgrade/pg_upgrade.h b/contrib/pg_upgrade/pg_upgrade.h
new file mode 100644
index feea324..82e16ce
*** a/contrib/pg_upgrade/pg_upgrade.h
--- b/contrib/pg_upgrade/pg_upgrade.h
*** void new_9_0_populate_pg_largeobject_met
*** 411,416 
--- 411,417 
  
  void		old_8_3_check_for_name_data_type_usage(ClusterInfo *cluster);
  void		old_8_3_check_for_tsquery_usage(ClusterInfo *cluster);
+ void		old_8_3_check_ltree(ClusterInfo *cluster);
  void		old_8_3_rebuild_tsvector_tables(ClusterInfo *cluster, bool check_mode);
  void		old_8_3_invalidate_hash_gin_indexes(ClusterInfo *cluster, bool check_mode);
  void old_8_3_invalidate_bpchar_pattern_ops_indexes(ClusterInfo *cluster,
diff --git a/contrib/pg_upgrade/version_old_8_3.c b/contrib/pg_upgrade/version_old_8_3.c
new file mode 100644
index 55d919c..ba14dc9
*** a/contrib/pg_upgrade/version_old_8_3.c
--- b/contrib/pg_upgrade/version_old_8_3.c
*** old_8_3_check_for_tsquery_usage(ClusterI
*** 202,207 
--- 202,288 
  
  
  /*
+  *	old_8_3_check_ltree()
+  *	8.3 - 8.4
+  *	The internal ltree structure was changed in 8.4 so upgrading is impossible.
+  */
+ void
+ old_8_3_check_ltree(ClusterInfo *cluster)
+ {
+ 	int			dbnum;
+ 	FILE	   *script = NULL;
+ 	bool		found = false;
+ 	char		output_path[MAXPGPATH];
+ 
+ 	prep_status(Checking for contrib/ltree);
+ 
+ 	snprintf(output_path, sizeof(output_path), %s/contrib_ltree.txt,
+ 			 os_info.cwd);
+ 
+ 	for (dbnum = 0; dbnum  cluster-dbarr.ndbs; dbnum++)
+ 	{
+ 		PGresult   *res;
+ 		bool		db_used = false;
+ 		int			ntups;
+ 		int			rowno;
+ 		int			i_nspname,
+ 	i_proname;
+ 		DbInfo	   *active_db = cluster-dbarr.dbs[dbnum];
+ 		PGconn	   *conn = connectToServer(cluster, active_db-db_name);
+ 
+ 		/* Find any functions coming from contrib/ltree */
+ 		res = executeQueryOrDie(conn,
+ SELECT n.nspname, p.proname 
+ FROM	pg_catalog.pg_proc p, 
+ 		pg_catalog.pg_namespace n 
+ WHERE	p.pronamespace = n.oid AND 
+ 		p.probin = '$libdir/ltree');
+ 
+ 		ntups = PQntuples(res);
+ 		i_nspname = PQfnumber(res, nspname);
+ 		i_proname = PQfnumber(res, proname);
+ 		for (rowno = 0; rowno  ntups; rowno++)
+ 		{
+ 			found = true;
+ 			if (script == NULL  (script = fopen(output_path, w)) == NULL)
+ pg_log(PG_FATAL, Could not open file \%s\: %s\n,
+ 	   output_path, getErrorText(errno));
+ 			if (!db_used)
+ 			{
+ fprintf(script, Database: %s\n, active_db-db_name);
+ db_used = true;
+ 			}
+ 			fprintf(script,   %s.%s\n,
+ 	PQgetvalue(res, rowno, i_nspname),
+ 	PQgetvalue(res, rowno, i_proname));
+ 		}
+ 
+ 		PQclear(res);
+ 
+ 		PQfinish(conn);
+ 	}
+ 
+ 	if (script)
+ 		fclose(script);
+ 
+ 	if (found)
+ 	{
+ 		pg_log(PG_REPORT, fatal\n);
+ 		pg_log(PG_FATAL,
+ 			   Your installation contains the \ltree\ data type.This data type\n
+ 			   changed its internal storage format between your old and new clusters so this\n
+ 			   cluster cannot currently be upgraded.  You can manually upgrade databases\n
+ 			   that use \contrib/isn\ facilities and remove \contrib/isn\ from the old\n
+ 			   cluster and restart the upgrade.  A list of the problem functions is in the\n
+ 			   file:\n
+ 			   %s\n\n, output_path);
+ 	}
+ 	else
+ 		check_ok();
+ }
+ 
+ 
+ /*
   * old_8_3_rebuild_tsvector_tables()
   *	8.3 - 8.4
   * 8.3 sorts lexemes by its length and 

Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-05 Thread hubert depesz lubaczewski
On Thu, Sep 01, 2011 at 08:05:51AM +0200, hubert depesz lubaczewski wrote:
 On Wed, Aug 31, 2011 at 09:54:20PM -0400, Bruce Momjian wrote:
  Working with depesz, I have found the cause.  The code I added to fix
  pg_upgrade in 9.0.4 and earlier releases didn't handle old 8.3 servers
  properly.  I mistakenly processed toast table with the same pg_dump
  query as used for pre-8.4 toast tables, not realizing those were not
  functional because there were no reloptions for toast tables in pre-8.4.
 
 Thanks a lot. Will test and post results (around sunday/monday I guess).

All worked.
pg_upgrade/vacuum didn't raise any errors. Will check some random
queries too, but don't expect anything to break.

thanks again for quick help.

Best regards,

depesz


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-05 Thread Bruce Momjian
hubert depesz lubaczewski wrote:
 On Thu, Sep 01, 2011 at 08:05:51AM +0200, hubert depesz lubaczewski wrote:
  On Wed, Aug 31, 2011 at 09:54:20PM -0400, Bruce Momjian wrote:
   Working with depesz, I have found the cause.  The code I added to fix
   pg_upgrade in 9.0.4 and earlier releases didn't handle old 8.3 servers
   properly.  I mistakenly processed toast table with the same pg_dump
   query as used for pre-8.4 toast tables, not realizing those were not
   functional because there were no reloptions for toast tables in pre-8.4.
  
  Thanks a lot. Will test and post results (around sunday/monday I guess).
 
 All worked.
 pg_upgrade/vacuum didn't raise any errors. Will check some random
 queries too, but don't expect anything to break.

Thanks.  I will announce the known bug and the fix.

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-05 Thread hubert depesz lubaczewski
On Mon, Sep 05, 2011 at 05:48:50PM +0200, hubert depesz lubaczewski wrote:
 On Thu, Sep 01, 2011 at 08:05:51AM +0200, hubert depesz lubaczewski wrote:
  On Wed, Aug 31, 2011 at 09:54:20PM -0400, Bruce Momjian wrote:
   Working with depesz, I have found the cause.  The code I added to fix
   pg_upgrade in 9.0.4 and earlier releases didn't handle old 8.3 servers
   properly.  I mistakenly processed toast table with the same pg_dump
   query as used for pre-8.4 toast tables, not realizing those were not
   functional because there were no reloptions for toast tables in pre-8.4.
  
  Thanks a lot. Will test and post results (around sunday/monday I guess).
 
 All worked.
 pg_upgrade/vacuum didn't raise any errors. Will check some random
 queries too, but don't expect anything to break.

Hmm .. got breakage.

Have table with ltree column, and any select to it causes:

=# select * from categories limit 1;
The connection to the server was lost. Attempting reset: Failed.

strace shows that backend read table, then it opened correct ltree.so,
but then:

29293 17:49:00.667865 stat(/opt/pgsql-9.0.5a-int/lib/ltree, 0x7fffb026ceb0) = 
-1 ENOENT (No such file or directory) 0.13
29293 17:49:00.667935 stat(/opt/pgsql-9.0.5a-int/lib/ltree.so, 
{st_mode=S_IFREG|0755, st_size=72966, ...}) = 0 0.10
29293 17:49:00.668007 stat(/opt/pgsql-9.0.5a-int/lib/ltree.so, 
{st_mode=S_IFREG|0755, st_size=72966, ...}) = 0 0.09
29293 17:49:00.668135 open(/opt/pgsql-9.0.5a-int/lib/ltree.so, O_RDONLY) = 46 
0.12
29293 17:49:00.668181 read(46, 
\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0\0\1\0\0\0\240.\0\0..., 832) = 832 
0.08
29293 17:49:00.668227 fstat(46, {st_mode=S_IFREG|0755, st_size=72966, ...}) = 0 
0.06
29293 17:49:00.668294 mmap(NULL, 2153248, PROT_READ|PROT_EXEC, 
MAP_PRIVATE|MAP_DENYWRITE, 46, 0) = 0x7feba4abc000 0.13
29293 17:49:00.668341 mprotect(0x7feba4aca000, 2093056, PROT_NONE) = 0 
0.12
29293 17:49:00.668381 mmap(0x7feba4cc9000, 4096, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 46, 0xd000) = 0x7feba4cc9000 0.12
29293 17:49:00.668429 close(46) = 0 0.07
29293 17:49:00.668715 open(/dev/tty, O_RDWR|O_NOCTTY|O_NONBLOCK) = -1 ENXIO 
(No such device or address) 0.17
29293 17:49:00.668771 writev(2, [{*** glibc detected *** , 23}, {postgres: 
postgres xxx [loca..., 41}, {: , 2}, {double free or corruption 
(!prev..., 33}, {: 0x, 4}, {00be67a0, 16}, { ***\n, 5}], 7) = 
124 0.14
29293 17:49:00.668863 open(/opt/pgsql-9.0.5a-int/lib/libgcc_s.so.1, O_RDONLY) 
= -1 ENOENT (No such file or directory) 0.10
29293 17:49:00.668907 open(/opt/pgsql-8.3.11-int/lib/libgcc_s.so.1, O_RDONLY) 
= -1 ENOENT (No such file or directory) 0.12
29293 17:49:00.668952 open(/etc/ld.so.cache, O_RDONLY) = 46 0.10
29293 17:49:00.668990 fstat(46, {st_mode=S_IFREG|0644, st_size=17400, ...}) = 0 
0.06
29293 17:49:00.669044 mmap(NULL, 17400, PROT_READ, MAP_PRIVATE, 46, 0) = 
0x7feba80d7000 0.08
29293 17:49:00.669077 close(46) = 0 0.06
29293 17:49:00.669110 access(/etc/ld.so.nohwcap, F_OK) = -1 ENOENT (No such 
file or directory) 0.09
29293 17:49:00.669156 open(/lib/libgcc_s.so.1, O_RDONLY) = 46 0.12
29293 17:49:00.669197 read(46, 
\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0\0\1\0\0\0\240!\0\0..., 832) = 832 
0.09
29293 17:49:00.669244 mmap(NULL, 134217728, PROT_NONE, 
MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0) = 0x7feb9cabc000 0.07
29293 17:49:00.669278 munmap(0x7feb9cabc000, 55853056) = 0 0.11
29293 17:49:00.669313 munmap(0x7feba400, 11255808) = 0 0.08
29293 17:49:00.669347 mprotect(0x7feba000, 135168, PROT_READ|PROT_WRITE) = 
0 0.08
29293 17:49:00.669387 fstat(46, {st_mode=S_IFREG|0644, st_size=56072, ...}) = 0 
0.06
29293 17:49:00.669451 mmap(NULL, 2151816, PROT_READ|PROT_EXEC, 
MAP_PRIVATE|MAP_DENYWRITE, 46, 0) = 0x7feba48ae000 0.09
29293 17:49:00.669487 mprotect(0x7feba48bb000, 2097152, PROT_NONE) = 0 
0.09
29293 17:49:00.669522 mmap(0x7feba4abb000, 4096, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 46, 0xd000) = 0x7feba4abb000 0.09
29293 17:49:00.669565 close(46) = 0 0.06
29293 17:49:00.669614 munmap(0x7feba80d7000, 17400) = 0 0.12
29293 17:49:00.669765 write(2, === Backtrace: =\n, 29) = 29 
0.11
29293 17:49:00.669852 writev(2, [{/lib/libc.so.6, 14}, {[0x, 3}, 
{7feba759908a, 12}, {]\n, 2}], 4) = 31 0.11
29293 17:49:00.669937 writev(2, [{/lib/libc.so.6, 14}, {(, 1}, {cfree, 
5}, {+0x, 3}, {8c, 2}, {), 1}, {[0x, 3}, {7feba759cc1c, 12}, {]\n, 
2}], 9) = 43 0.12
29293 17:49:00.670128 writev(2, [{postgres: postgres xxx [loca..., 41}, 
{[0x, 3}, {6c18c9, 6}, {]\n, 2}], 4) = 52 0.11
29293 17:49:00.670289 writev(2, [{postgres: postgres xxx [loca..., 41}, 
{(, 1}, {MemoryContextDelete, 19}, {+0x, 3}, {54, 2}, {), 1}, {[0x, 
3}, {6c1e54, 6}, {]\n, 2}], 9) = 78 0.12
29293 17:49:00.670453 writev(2, [{postgres: postgres xxx [loca..., 41}, 
{(, 1}, 

Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-05 Thread Bruce Momjian
Bruce Momjian wrote:
 hubert depesz lubaczewski wrote:
  On Wed, Aug 31, 2011 at 01:23:05PM -0400, Bruce Momjian wrote:
   Can you get me the 9.0.X pg_class.relfrozenxid for the toast and heap
   tables involved?
  
  Sure:
  
  =# select oid::regclass, relfrozenxid from pg_class  where relname in 
  ('transactions', 'pg_toast_106668498');
   oid | relfrozenxid 
  -+--
   pg_toast.pg_toast_106668498 |   3673553926
   transactions|   3623560321
  (2 rows)
 
 Working with depesz, I have found the cause.  The code I added to fix
 pg_upgrade in 9.0.4 and earlier releases didn't handle old 8.3 servers
 properly.  I mistakenly processed toast table with the same pg_dump
 query as used for pre-8.4 toast tables, not realizing those were not
 functional because there were no reloptions for toast tables in pre-8.4.
 
 The attached applied patches fix all releases.  This will have to be
 mentioned in the 9.0.5 release notes, and we should probably do the same
 kind of announcement we did when I fixed this for 9.0.4.  :-(
 
 Yeah, I should not have caused this bug.  It did not show up in any of
 my testing.

I have posted the bug and fix announcement to the announce email list.

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-05 Thread Bruce Momjian
hubert depesz lubaczewski wrote:
 I'm not sure if it's upgrade thing, or is it because of error in
 ltree/compilation, but it looks bad.
 
 Is there any more info I could show/gather to help debug the issue?

I am confused by the error --- is it not loading, or can you get a
backtrace of the crash?

If I had to take a guess, it would be that there is some ltree
incompatibility from PG 8.3 that we didn't know about.

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-05 Thread hubert depesz lubaczewski
On Mon, Sep 05, 2011 at 02:18:18PM -0400, Bruce Momjian wrote:
 hubert depesz lubaczewski wrote:
  I'm not sure if it's upgrade thing, or is it because of error in
  ltree/compilation, but it looks bad.
  
  Is there any more info I could show/gather to help debug the issue?
 
 I am confused by the error --- is it not loading, or can you get a
 backtrace of the crash?

The one in logs is not sufficient?
If not - could you tell me how to make the backtrace? I'm by far not a c
programmer, so for this I'd need some tutoring.

 If I had to take a guess, it would be that there is some ltree
 incompatibility from PG 8.3 that we didn't know about.

it's possible.

Best regards,

depesz

-- 
The best thing about modern society is how easy it is to avoid contact with it.
 http://depesz.com/

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-05 Thread Bruce Momjian
hubert depesz lubaczewski wrote:
 On Mon, Sep 05, 2011 at 02:18:18PM -0400, Bruce Momjian wrote:
  hubert depesz lubaczewski wrote:
   I'm not sure if it's upgrade thing, or is it because of error in
   ltree/compilation, but it looks bad.
   
   Is there any more info I could show/gather to help debug the issue?
  
  I am confused by the error --- is it not loading, or can you get a
  backtrace of the crash?
 
 The one in logs is not sufficient?
 If not - could you tell me how to make the backtrace? I'm by far not a c
 programmer, so for this I'd need some tutoring.

I think you want this:


http://wiki.postgresql.org/wiki/Getting_a_stack_trace_of_a_running_PostgreSQL_backend_on_Linux/BSD

While strace is useful, it doesn't show us where the C code is failing.

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-05 Thread Tom Lane
hubert depesz lubaczewski dep...@depesz.com writes:
 On Mon, Sep 05, 2011 at 02:18:18PM -0400, Bruce Momjian wrote:
 If I had to take a guess, it would be that there is some ltree
 incompatibility from PG 8.3 that we didn't know about.

 it's possible.

[ checks the git history... ]  This 8.4 commit:
http://git.postgresql.org/gitweb/?p=postgresql.gita=commitdiffh=8eee65c996048848c20f6637c1d12b319a4ce244
changed a number of ltree data structures, though I'm not sure whether
any of those are on-disk structures.

regards, tom lane

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-05 Thread hubert depesz lubaczewski
On Mon, Sep 05, 2011 at 02:51:12PM -0400, Bruce Momjian wrote:
 hubert depesz lubaczewski wrote:
  On Mon, Sep 05, 2011 at 02:18:18PM -0400, Bruce Momjian wrote:
   hubert depesz lubaczewski wrote:
I'm not sure if it's upgrade thing, or is it because of error in
ltree/compilation, but it looks bad.

Is there any more info I could show/gather to help debug the issue?
   
   I am confused by the error --- is it not loading, or can you get a
   backtrace of the crash?
  
  The one in logs is not sufficient?
  If not - could you tell me how to make the backtrace? I'm by far not a c
  programmer, so for this I'd need some tutoring.
 
 I think you want this:
 
   
 http://wiki.postgresql.org/wiki/Getting_a_stack_trace_of_a_running_PostgreSQL_backend_on_Linux/BSD
 
 While strace is useful, it doesn't show us where the C code is failing.

ok.
got this:

(gdb) bt
#0  0x7fdc28605095 in raise () from /lib/libc.so.6
#1  0x7fdc28606af0 in abort () from /lib/libc.so.6
#2  0x7fdc2863fa7b in ?? () from /lib/libc.so.6
#3  0x7fdc2864708a in ?? () from /lib/libc.so.6
#4  0x7fdc2864ac1c in free () from /lib/libc.so.6
#5  0x006c18c9 in AllocSetDelete (context=value optimized out) at 
aset.c:551
#6  0x006c1e54 in MemoryContextDelete (context=0xbdae80) at mcxt.c:196
#7  0x0054913e in standard_ExecutorEnd (queryDesc=0xbbb4f0) at 
execMain.c:360
#8  0x0051c88f in PortalCleanup (portal=0xbb7a70) at portalcmds.c:268
#9  0x006c26fc in PortalDrop (portal=0xbb7a70, isTopCommit=0 '\0') at 
portalmem.c:434
#10 0x005f8c95 in exec_simple_query (query_string=0xb9b980 select * 
from categories limit 1;) at postgres.c:1067
#11 0x005f95de in PostgresMain (argc=value optimized out, argv=value 
optimized out, username=value optimized out) at postgres.c:3936
#12 0x005c94f6 in ServerLoop () at postmaster.c:3555
#13 0x005ca0fe in PostmasterMain (argc=3, argv=0xaf0870) at 
postmaster.c:1092
#14 0x00574070 in main (argc=3, argv=0xaf0870) at main.c:188

Hope it helps.

Best regards,

depesz

-- 
The best thing about modern society is how easy it is to avoid contact with it.
 http://depesz.com/

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-05 Thread Bruce Momjian
hubert depesz lubaczewski wrote:
 On Mon, Sep 05, 2011 at 02:51:12PM -0400, Bruce Momjian wrote:
  hubert depesz lubaczewski wrote:
   On Mon, Sep 05, 2011 at 02:18:18PM -0400, Bruce Momjian wrote:
hubert depesz lubaczewski wrote:
 I'm not sure if it's upgrade thing, or is it because of error in
 ltree/compilation, but it looks bad.
 
 Is there any more info I could show/gather to help debug the issue?

I am confused by the error --- is it not loading, or can you get a
backtrace of the crash?
   
   The one in logs is not sufficient?
   If not - could you tell me how to make the backtrace? I'm by far not a c
   programmer, so for this I'd need some tutoring.
  
  I think you want this:
  
  
  http://wiki.postgresql.org/wiki/Getting_a_stack_trace_of_a_running_PostgreSQL_backend_on_Linux/BSD
  
  While strace is useful, it doesn't show us where the C code is failing.
 
 ok.
 got this:
 
 (gdb) bt
 #0  0x7fdc28605095 in raise () from /lib/libc.so.6
 #1  0x7fdc28606af0 in abort () from /lib/libc.so.6
 #2  0x7fdc2863fa7b in ?? () from /lib/libc.so.6
 #3  0x7fdc2864708a in ?? () from /lib/libc.so.6
 #4  0x7fdc2864ac1c in free () from /lib/libc.so.6
 #5  0x006c18c9 in AllocSetDelete (context=value optimized out) at 
 aset.c:551
 #6  0x006c1e54 in MemoryContextDelete (context=0xbdae80) at mcxt.c:196
 #7  0x0054913e in standard_ExecutorEnd (queryDesc=0xbbb4f0) at 
 execMain.c:360
 #8  0x0051c88f in PortalCleanup (portal=0xbb7a70) at portalcmds.c:268
 #9  0x006c26fc in PortalDrop (portal=0xbb7a70, isTopCommit=0 '\0') at 
 portalmem.c:434
 #10 0x005f8c95 in exec_simple_query (query_string=0xb9b980 select * 
 from categories limit 1;) at postgres.c:1067
 #11 0x005f95de in PostgresMain (argc=value optimized out, 
 argv=value optimized out, username=value optimized out) at postgres.c:3936
 #12 0x005c94f6 in ServerLoop () at postmaster.c:3555
 #13 0x005ca0fe in PostmasterMain (argc=3, argv=0xaf0870) at 
 postmaster.c:1092
 #14 0x00574070 in main (argc=3, argv=0xaf0870) at main.c:188

Good.  Is it possible to compile with debug symbols, -g?  Odd you are
crashing in libc.

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-05 Thread hubert depesz lubaczewski
On Mon, Sep 05, 2011 at 04:43:47PM -0400, Bruce Momjian wrote:
 hubert depesz lubaczewski wrote:
  On Mon, Sep 05, 2011 at 02:51:12PM -0400, Bruce Momjian wrote:
   hubert depesz lubaczewski wrote:
On Mon, Sep 05, 2011 at 02:18:18PM -0400, Bruce Momjian wrote:
 hubert depesz lubaczewski wrote:
  I'm not sure if it's upgrade thing, or is it because of error in
  ltree/compilation, but it looks bad.
  
  Is there any more info I could show/gather to help debug the issue?
 
 I am confused by the error --- is it not loading, or can you get a
 backtrace of the crash?

The one in logs is not sufficient?
If not - could you tell me how to make the backtrace? I'm by far not a c
programmer, so for this I'd need some tutoring.
   
   I think you want this:
   
 
   http://wiki.postgresql.org/wiki/Getting_a_stack_trace_of_a_running_PostgreSQL_backend_on_Linux/BSD
   
   While strace is useful, it doesn't show us where the C code is failing.
  
  ok.
  got this:
  
  (gdb) bt
  #0  0x7fdc28605095 in raise () from /lib/libc.so.6
  #1  0x7fdc28606af0 in abort () from /lib/libc.so.6
  #2  0x7fdc2863fa7b in ?? () from /lib/libc.so.6
  #3  0x7fdc2864708a in ?? () from /lib/libc.so.6
  #4  0x7fdc2864ac1c in free () from /lib/libc.so.6
  #5  0x006c18c9 in AllocSetDelete (context=value optimized out) at 
  aset.c:551
  #6  0x006c1e54 in MemoryContextDelete (context=0xbdae80) at 
  mcxt.c:196
  #7  0x0054913e in standard_ExecutorEnd (queryDesc=0xbbb4f0) at 
  execMain.c:360
  #8  0x0051c88f in PortalCleanup (portal=0xbb7a70) at 
  portalcmds.c:268
  #9  0x006c26fc in PortalDrop (portal=0xbb7a70, isTopCommit=0 '\0') 
  at portalmem.c:434
  #10 0x005f8c95 in exec_simple_query (query_string=0xb9b980 select 
  * from categories limit 1;) at postgres.c:1067
  #11 0x005f95de in PostgresMain (argc=value optimized out, 
  argv=value optimized out, username=value optimized out) at 
  postgres.c:3936
  #12 0x005c94f6 in ServerLoop () at postmaster.c:3555
  #13 0x005ca0fe in PostmasterMain (argc=3, argv=0xaf0870) at 
  postmaster.c:1092
  #14 0x00574070 in main (argc=3, argv=0xaf0870) at main.c:188
 
 Good.  Is it possible to compile with debug symbols, -g?  Odd you are
 crashing in libc.

this had debug:

./configure \
--prefix=/opt/pgsql-9.0.5a-int \
--enable-debug \
--disable-rpath \
--without-perl \
--without-python \
--without-tcl \
--without-openssl \
--without-pam \
--without-krb5 \
--without-gssapi \
--enable-nls \
--enable-integer-datetimes \
--enable-thread-safety \
--with-libxml \
--with-libxslt \
--without-ldap

Best regards,

depesz

-- 
The best thing about modern society is how easy it is to avoid contact with it.
 http://depesz.com/

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-05 Thread Bruce Momjian
hubert depesz lubaczewski wrote:
  Good.  Is it possible to compile with debug symbols, -g?  Odd you are
  crashing in libc.
 
 this had debug:
 
 ./configure \
 --prefix=/opt/pgsql-9.0.5a-int \
 --enable-debug \
 --disable-rpath \
 --without-perl \
 --without-python \
 --without-tcl \
 --without-openssl \
 --without-pam \
 --without-krb5 \
 --without-gssapi \
 --enable-nls \
 --enable-integer-datetimes \
 --enable-thread-safety \
 --with-libxml \
 --with-libxslt \
 --without-ldap

--enable-debug adds internal debug calls, not compiler debug symbols.

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-05 Thread Bruce Momjian
hubert depesz lubaczewski wrote:
 On Mon, Sep 05, 2011 at 02:51:12PM -0400, Bruce Momjian wrote:
  hubert depesz lubaczewski wrote:
   On Mon, Sep 05, 2011 at 02:18:18PM -0400, Bruce Momjian wrote:
hubert depesz lubaczewski wrote:
 I'm not sure if it's upgrade thing, or is it because of error in
 ltree/compilation, but it looks bad.
 
 Is there any more info I could show/gather to help debug the issue?

I am confused by the error --- is it not loading, or can you get a
backtrace of the crash?
   
   The one in logs is not sufficient?
   If not - could you tell me how to make the backtrace? I'm by far not a c
   programmer, so for this I'd need some tutoring.
  
  I think you want this:
  
  
  http://wiki.postgresql.org/wiki/Getting_a_stack_trace_of_a_running_PostgreSQL_backend_on_Linux/BSD
  
  While strace is useful, it doesn't show us where the C code is failing.
 
 ok.
 got this:
 
 (gdb) bt
 #0  0x7fdc28605095 in raise () from /lib/libc.so.6
 #1  0x7fdc28606af0 in abort () from /lib/libc.so.6
 #2  0x7fdc2863fa7b in ?? () from /lib/libc.so.6
 #3  0x7fdc2864708a in ?? () from /lib/libc.so.6
 #4  0x7fdc2864ac1c in free () from /lib/libc.so.6
 #5  0x006c18c9 in AllocSetDelete (context=value optimized out) at 
 aset.c:551
 #6  0x006c1e54 in MemoryContextDelete (context=0xbdae80) at mcxt.c:196
 #7  0x0054913e in standard_ExecutorEnd (queryDesc=0xbbb4f0) at 
 execMain.c:360
 #8  0x0051c88f in PortalCleanup (portal=0xbb7a70) at portalcmds.c:268
 #9  0x006c26fc in PortalDrop (portal=0xbb7a70, isTopCommit=0 '\0') at 
 portalmem.c:434
 #10 0x005f8c95 in exec_simple_query (query_string=0xb9b980 select * 
 from categories limit 1;) at postgres.c:1067
 #11 0x005f95de in PostgresMain (argc=value optimized out, 
 argv=value optimized out, username=value optimized out) at postgres.c:3936
 #12 0x005c94f6 in ServerLoop () at postmaster.c:3555
 #13 0x005ca0fe in PostmasterMain (argc=3, argv=0xaf0870) at 
 postmaster.c:1092
 #14 0x00574070 in main (argc=3, argv=0xaf0870) at main.c:188

Odd it is dying in the memory freeing at executor close --- not in the
ltree code.

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-05 Thread Tom Lane
Bruce Momjian br...@momjian.us writes:
 Odd it is dying in the memory freeing at executor close --- not in the
 ltree code.

Doesn't seem odd.  The glibc complaint previously shown already
indicates this is a memory stomp problem.

--enable-cassert might (or might not) provide additional help.

regards, tom lane

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-05 Thread daveg

Sorry I missed your reply, catching up now.

On Wed, Aug 31, 2011 at 09:56:59PM -0400, Bruce Momjian wrote:
 daveg wrote:
  On Mon, Aug 29, 2011 at 07:49:24PM +0200, hubert depesz lubaczewski wrote:
   On Mon, Aug 29, 2011 at 06:54:41PM +0200, hubert depesz lubaczewski wrote:
   vacuumdb: vacuuming of database etsy_v2 failed: ERROR:  could not 
   access status of transaction 3429738606
   DETAIL:  Could not open file pg_clog/0CC6: No such file or directory.
   
   Interestingly.
   
   In old dir there is pg_clog directory with files:
   0AC0 .. 0DAF (including 0CC6, size 262144)
   but new pg_clog has only:
   0D2F .. 0DB0
   
   File content - nearly all files that exist in both places are the same, 
   with exception of 2 newest ones in new datadir:
   3c5122f3e80851735c19522065a2d12a  0DAF
   8651fc2b9fa3d27cfb5b496165cead68  0DB0
   
   0DB0 doesn't exist in old, and 0DAF has different md5sum: 
   7d48996c762d6a10f8eda88ae766c5dd
...
  I had this same thing happen this Saturday just past and my client had to
  restore the whole 2+ TB instance from the previous days pg_dumps.
...
  After running pg_upgrade apparently successfully and analyzeing all the

Update: reviewing the logs I see some of the analyzes hit the could not
access status of transaction error too.

  tables we restarted the production workload and started getting errors:
  
  2011-08-27 04:18:34.015  12337  c06  postgres  ERROR:  could not access 
  status of transaction 2923961093
  2011-08-27 04:18:34.015  12337  c06  postgres  DETAIL:  Could not open file 
  pg_clog/0AE4: No such file or directory.
  2011-08-27 04:18:34.015  12337  c06  postgres  STATEMENT:  analyze 
  public.b_pxx;
  
  On examination the pg_clog directory contained on two files timestamped
  after the startup of the new cluster with 9.0.4. Other hosts that upgraded
  successfully had numerous files in pg_clog dating back a few days. So it
  appears that all the clog files went missing during the upgrade somehow.
  a
  This happened upgrading from 8.4.7 to 9.0.4, with a brief session in between
  at 8.4.8. We have upgraded several hosts to 9.0.4 successfully previously.
 
 I have posted this fix to the hackers email list, but I found it only
 affected old 8.3 servers, not old 8.4.X, so I am confused by your bug
 report.
 
 I have tested 8.4.X to 9.0.4 and found pg_upgrade preserves toast
 relfrozenxids properly in that case.
 
 Can you tell me what table is showing this error?  Does it happen during
 vacuum?  Can you run a vacuum verbose to see what it is throwing the
 error on?  Thanks.

This was upgrading from 8.4.8 to 9.0.4. I don't have the running cluster
anymore, but I do have tar.gz archives of it and could probably find
2TB free somewhere to restore it to if there is something useful to extract.

However, I don't think this was toast related. Most of our rows are short and 
have only int, float, and short text columns. These errors hit over 60
different tables mostly during the analyzes we ran immediately after the
upgrade. It also hit during select, insert and delete statements. We did not
run the db more than a few minutes as the damage was so extensive.

As far as I can tell pg_upgrade never copied any pg_clog files from the
old cluster to the new cluster. I wish I had detected that before running
the remove_old_cluster.sh script.

-dg

-- 
David Gould   da...@sonic.net  510 536 1443510 282 0869
If simplicity worked, the world would be overrun with insects.

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-05 Thread Bruce Momjian
daveg wrote:
  Can you tell me what table is showing this error?  Does it happen during
  vacuum?  Can you run a vacuum verbose to see what it is throwing the
  error on?  Thanks.
 
 This was upgrading from 8.4.8 to 9.0.4. I don't have the running cluster
 anymore, but I do have tar.gz archives of it and could probably find
 2TB free somewhere to restore it to if there is something useful to extract.
 
 However, I don't think this was toast related. Most of our rows are short and 
 have only int, float, and short text columns. These errors hit over 60
 different tables mostly during the analyzes we ran immediately after the
 upgrade. It also hit during select, insert and delete statements. We did not
 run the db more than a few minutes as the damage was so extensive.
 
 As far as I can tell pg_upgrade never copied any pg_clog files from the
 old cluster to the new cluster. I wish I had detected that before running
 the remove_old_cluster.sh script.

Wow, no clogs?  That would make the system very confused.  You can pull
the clogs out of the old backup and move them over if the files don't
already exist.

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-05 Thread daveg
On Mon, Sep 05, 2011 at 08:19:21PM -0400, Bruce Momjian wrote:
 daveg wrote:
   Can you tell me what table is showing this error?  Does it happen during
   vacuum?  Can you run a vacuum verbose to see what it is throwing the
   error on?  Thanks.
  
  This was upgrading from 8.4.8 to 9.0.4. I don't have the running cluster
  anymore, but I do have tar.gz archives of it and could probably find
  2TB free somewhere to restore it to if there is something useful to extract.
  
  However, I don't think this was toast related. Most of our rows are short 
  and 
  have only int, float, and short text columns. These errors hit over 60
  different tables mostly during the analyzes we ran immediately after the
  upgrade. It also hit during select, insert and delete statements. We did not
  run the db more than a few minutes as the damage was so extensive.
  
  As far as I can tell pg_upgrade never copied any pg_clog files from the
  old cluster to the new cluster. I wish I had detected that before running
  the remove_old_cluster.sh script.
 
 Wow, no clogs?  That would make the system very confused.  You can pull
 the clogs out of the old backup and move them over if the files don't
 already exist.

We don't have the old cluster after running delete_old_cluster.ch. We use
pg_dump for backup, so no clogs.  We ended up restored 20 odd dbs totalling
2.1TB from the previous days pg_dumps.

If you review my original report I mentioned that there were only 2 clog
files in the new cluster both with ctime after the start of postgresql
after the upgrade. I did the upgrade for three hosts at the same time, the
others were fine. They have dozens of clogs dating back days before the
upgrade. The failing system had only 2 recent clog.

-dg
 
 -- 
   Bruce Momjian  br...@momjian.ushttp://momjian.us
   EnterpriseDB http://enterprisedb.com
 
   + It's impossible for everything to be true. +
 

-- 
David Gould   da...@sonic.net  510 536 1443510 282 0869
If simplicity worked, the world would be overrun with insects.

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-05 Thread Bruce Momjian
daveg wrote:
   As far as I can tell pg_upgrade never copied any pg_clog files from the
   old cluster to the new cluster. I wish I had detected that before running
   the remove_old_cluster.sh script.
  
  Wow, no clogs?  That would make the system very confused.  You can pull
  the clogs out of the old backup and move them over if the files don't
  already exist.
 
 We don't have the old cluster after running delete_old_cluster.ch. We use
 pg_dump for backup, so no clogs.  We ended up restored 20 odd dbs totalling
 2.1TB from the previous days pg_dumps.
 
 If you review my original report I mentioned that there were only 2 clog
 files in the new cluster both with ctime after the start of postgresql
 after the upgrade. I did the upgrade for three hosts at the same time, the
 others were fine. They have dozens of clogs dating back days before the
 upgrade. The failing system had only 2 recent clog.

That is certainly unusual.

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-09-01 Thread hubert depesz lubaczewski
On Wed, Aug 31, 2011 at 09:54:20PM -0400, Bruce Momjian wrote:
 Working with depesz, I have found the cause.  The code I added to fix
 pg_upgrade in 9.0.4 and earlier releases didn't handle old 8.3 servers
 properly.  I mistakenly processed toast table with the same pg_dump
 query as used for pre-8.4 toast tables, not realizing those were not
 functional because there were no reloptions for toast tables in pre-8.4.

Thanks a lot. Will test and post results (around sunday/monday I guess).

Best regards,

depesz


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-31 Thread Bruce Momjian
hubert depesz lubaczewski wrote:
 On Fri, Aug 26, 2011 at 12:18:55AM -0400, Bruce Momjian wrote:
  
  OK, this was very helpful.  I found out that there is a bug in current
  9.0.X, 9.1.X, and HEAD that I introduced recently when I excluded temp
  tables.  (The bug is not in any released version of pg_upgrade.)  The
  attached, applied patches should fix it for you.  I assume you are
  running 9.0.X, and not 9.0.4.
 
 pg_upgrade worked. Now I'm doing reindex and later on vacuumdb -az.
 
 will keep you posted.

FYI, this pg_upgrade bug exists in PG 9.1RC1, but not in earlier betas. 
Users can either wait for 9.1 RC2 or Final, or use the patch I posted. 
The bug is not in 9.0.4 and will not be in 9.0.5.

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-31 Thread Bruce Momjian
Alvaro Herrera wrote:
 Excerpts from hubert depesz lubaczewski's message of lun ago 29 14:49:24 
 -0300 2011:
  On Mon, Aug 29, 2011 at 06:54:41PM +0200, hubert depesz lubaczewski wrote:
   On Fri, Aug 26, 2011 at 05:28:35PM +0200, hubert depesz lubaczewski wrote:
On Fri, Aug 26, 2011 at 12:18:55AM -0400, Bruce Momjian wrote:
 
 OK, this was very helpful.  I found out that there is a bug in current
 9.0.X, 9.1.X, and HEAD that I introduced recently when I excluded temp
 tables.  (The bug is not in any released version of pg_upgrade.)  The
 attached, applied patches should fix it for you.  I assume you are
 running 9.0.X, and not 9.0.4.

pg_upgrade worked. Now I'm doing reindex and later on vacuumdb -az.
   
   vacuumdb failed. The fail looks very similar to the one I had on 9.0.4.
   
   After long vacuum I got:
   INFO:  vacuuming pg_toast.pg_toast_106668498
   vacuumdb: vacuuming of database etsy_v2 failed: ERROR:  could not 
   access status of transaction 3429738606
   DETAIL:  Could not open file pg_clog/0CC6: No such file or directory.
 
 I don't understand the pg_upgrade code here.  It is setting the
 datfrozenxid and relfrozenxid values to the latest checkpoint's NextXID,
 
 /* set pg_class.relfrozenxid */
 PQclear(executeQueryOrDie(conn,
   UPDATE   pg_catalog.pg_class 
   SET  relfrozenxid = '%u' 
 /* only heap and TOAST are vacuumed */
   WHERErelkind IN ('r', 't'),
   old_cluster.controldata.chkpnt_nxtxid));
 
 but I don't see why this is safe.  I mean, surely the previous
 vacuum might have been a lot earlier than that.  Are these values reset
 to more correct values (i.e. older ones) later somehow?  My question is,
 why isn't the new cluster completely screwed?

Have you looked at my pg_upgrade presentation?

http://momjian.us/main/presentations/features.html#pg_upgrade

This query happens after we have done a VACUUM FREEEZE on an empty
cluster.

pg_dump --binary-upgrade will dump out the proper relfrozen xids for
every object that gets its file system files copied or linked.

 I wonder if pg_upgrade shouldn't be doing the conservative thing here,
 which AFAICT would be to set all frozenxid values as furthest in the
 past as possible (without causing a shutdown-due-to-wraparound, and
 maybe without causing autovacuum to enter emergency mode either).

I already get complaints about requiring an analyze run after the
upgrade --- this would make it much worse.  In fact I have to look into
upgrading optimizer statistics someday.

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-31 Thread Robert Haas
On Wed, Aug 31, 2011 at 12:16 PM, Bruce Momjian br...@momjian.us wrote:
 hubert depesz lubaczewski wrote:
 On Fri, Aug 26, 2011 at 12:18:55AM -0400, Bruce Momjian wrote:
 
  OK, this was very helpful.  I found out that there is a bug in current
  9.0.X, 9.1.X, and HEAD that I introduced recently when I excluded temp
  tables.  (The bug is not in any released version of pg_upgrade.)  The
  attached, applied patches should fix it for you.  I assume you are
  running 9.0.X, and not 9.0.4.

 pg_upgrade worked. Now I'm doing reindex and later on vacuumdb -az.

 will keep you posted.

 FYI, this pg_upgrade bug exists in PG 9.1RC1, but not in earlier betas.
 Users can either wait for 9.1 RC2 or Final, or use the patch I posted.
 The bug is not in 9.0.4 and will not be in 9.0.5.

Based on subsequent discussion on this thread, it sounds like
something is still broken.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-31 Thread hubert depesz lubaczewski
On Wed, Aug 31, 2011 at 12:16:03PM -0400, Bruce Momjian wrote:
 hubert depesz lubaczewski wrote:
  On Fri, Aug 26, 2011 at 12:18:55AM -0400, Bruce Momjian wrote:
   
   OK, this was very helpful.  I found out that there is a bug in current
   9.0.X, 9.1.X, and HEAD that I introduced recently when I excluded temp
   tables.  (The bug is not in any released version of pg_upgrade.)  The
   attached, applied patches should fix it for you.  I assume you are
   running 9.0.X, and not 9.0.4.
  
  pg_upgrade worked. Now I'm doing reindex and later on vacuumdb -az.
  
  will keep you posted.
 
 FYI, this pg_upgrade bug exists in PG 9.1RC1, but not in earlier betas. 
 Users can either wait for 9.1 RC2 or Final, or use the patch I posted. 
 The bug is not in 9.0.4 and will not be in 9.0.5.

I assume you mean the bug that caused pg_upgrade to fail.

But there still is (existing in 9.0.4 too) bug which causes vacuum to
fail.

Best regards,

depesz


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-31 Thread Bruce Momjian
hubert depesz lubaczewski wrote:
 INFO:  vacuuming pg_toast.pg_toast_106668498
 vacuumdb: vacuuming of database etsy_v2 failed: ERROR:  could not access 
 status of transaction 3429738606
 DETAIL:  Could not open file pg_clog/0CC6: No such file or directory.
 
 Interestingly.
 
 In old dir there is pg_clog directory with files:
 0AC0 .. 0DAF (including 0CC6, size 262144)
 but new pg_clog has only:
 0D2F .. 0DB0
 
 File content - nearly all files that exist in both places are the same, with 
 exception of 2 newest ones in new datadir:
 3c5122f3e80851735c19522065a2d12a  0DAF
 8651fc2b9fa3d27cfb5b496165cead68  0DB0
 
 0DB0 doesn't exist in old, and 0DAF has different md5sum: 
 7d48996c762d6a10f8eda88ae766c5dd
 
 one more thing. I did select count(*) from transactions and it worked.

Count(*) worked because it didn't access any of the long/toasted values.

 that's about it. I can probably copy over files from old datadir to new (in
 pg_clog/), and will be happy to do it, but I'll wait for your call - retry 
 with
 copies files might destroy some evidence.

You can safely copy over any of the clog files that exist in the old
cluster but not in the new one, but another vacuum is likely to remove
those files again.  :-(

This sure sounds like a variation on the pg_upgrade/toast bug we fixed
in 9.0.4:

http://wiki.postgresql.org/wiki/20110408pg_upgrade_fix

Can you get me the 9.0.X pg_class.relfrozenxid for the toast and heap
tables involved?

FYI, this is what pg_dump --binary-upgrade does to preserve the
relfrozenxids:

-- For binary upgrade, set heap's relfrozenxid
UPDATE pg_catalog.pg_class
SET relfrozenxid = '702'
WHERE oid = 'test'::pg_catalog.regclass;

-- For binary upgrade, set toast's relfrozenxid
UPDATE pg_catalog.pg_class
SET relfrozenxid = '702'
WHERE oid = '16434';

We also preserve the pg_class oids with:

-- For binary upgrade, must preserve pg_class oids
SELECT 
binary_upgrade.set_next_heap_pg_class_oid('16431'::pg_catalog.oid);
SELECT 
binary_upgrade.set_next_toast_pg_class_oid('16434'::pg_catalog.oid);
SELECT 
binary_upgrade.set_next_index_pg_class_oid('16436'::pg_catalog.oid);

The question is whether this is working, and if not, why not?

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-31 Thread Bruce Momjian
hubert depesz lubaczewski wrote:
 On Wed, Aug 31, 2011 at 12:16:03PM -0400, Bruce Momjian wrote:
  hubert depesz lubaczewski wrote:
   On Fri, Aug 26, 2011 at 12:18:55AM -0400, Bruce Momjian wrote:

OK, this was very helpful.  I found out that there is a bug in current
9.0.X, 9.1.X, and HEAD that I introduced recently when I excluded temp
tables.  (The bug is not in any released version of pg_upgrade.)  The
attached, applied patches should fix it for you.  I assume you are
running 9.0.X, and not 9.0.4.
   
   pg_upgrade worked. Now I'm doing reindex and later on vacuumdb -az.
   
   will keep you posted.
  
  FYI, this pg_upgrade bug exists in PG 9.1RC1, but not in earlier betas. 
  Users can either wait for 9.1 RC2 or Final, or use the patch I posted. 
  The bug is not in 9.0.4 and will not be in 9.0.5.
 
 I assume you mean the bug that caused pg_upgrade to fail.

Yes.

 But there still is (existing in 9.0.4 too) bug which causes vacuum to
 fail.

Yes.  We need to find the cause of that new bug.

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-31 Thread Alvaro Herrera
Excerpts from Bruce Momjian's message of mié ago 31 13:23:07 -0300 2011:
 Alvaro Herrera wrote:
  Excerpts from hubert depesz lubaczewski's message of lun ago 29 14:49:24 
  -0300 2011:
   On Mon, Aug 29, 2011 at 06:54:41PM +0200, hubert depesz lubaczewski wrote:
On Fri, Aug 26, 2011 at 05:28:35PM +0200, hubert depesz lubaczewski 
wrote:
 On Fri, Aug 26, 2011 at 12:18:55AM -0400, Bruce Momjian wrote:
  
  OK, this was very helpful.  I found out that there is a bug in 
  current
  9.0.X, 9.1.X, and HEAD that I introduced recently when I excluded 
  temp
  tables.  (The bug is not in any released version of pg_upgrade.)  
  The
  attached, applied patches should fix it for you.  I assume you are
  running 9.0.X, and not 9.0.4.
 
 pg_upgrade worked. Now I'm doing reindex and later on vacuumdb -az.

vacuumdb failed. The fail looks very similar to the one I had on 9.0.4.

After long vacuum I got:
INFO:  vacuuming pg_toast.pg_toast_106668498
vacuumdb: vacuuming of database etsy_v2 failed: ERROR:  could not 
access status of transaction 3429738606
DETAIL:  Could not open file pg_clog/0CC6: No such file or directory.
  
  I don't understand the pg_upgrade code here.  It is setting the
  datfrozenxid and relfrozenxid values to the latest checkpoint's NextXID,
  
  /* set pg_class.relfrozenxid */
  PQclear(executeQueryOrDie(conn,
UPDATE   pg_catalog.pg_class 
SET  relfrozenxid = '%u' 
  /* only heap and TOAST are vacuumed */
WHERErelkind IN ('r', 't'),
old_cluster.controldata.chkpnt_nxtxid));
  
  but I don't see why this is safe.  I mean, surely the previous
  vacuum might have been a lot earlier than that.  Are these values reset
  to more correct values (i.e. older ones) later somehow?  My question is,
  why isn't the new cluster completely screwed?
 
 Have you looked at my pg_upgrade presentation?
 
 http://momjian.us/main/presentations/features.html#pg_upgrade

I just did, but it doesn't explain this in much detail.  (In any case I
don't think we should be relying in a PDF presentation to explain the
inner pg_upgrade details.  I think we should rely more on the
IMPLEMENTATION file rather than your PDF ... amusingly that file doesn't
mention the frozenxids.)

 This query happens after we have done a VACUUM FREEEZE on an empty
 cluster.

Oh, so it only affects the databases that initdb created, right?
The other ones are not even created yet.

 pg_dump --binary-upgrade will dump out the proper relfrozen xids for
 every object that gets its file system files copied or linked.

Okay.  I assume that between the moment you copy the pg_clog files from
the old server, and the moment you do the UPDATEs on pg_class and
pg_database, there is no chance for vacuum to run and remove clog
segments.

Still, it seems to me that this coding makes Min(datfrozenxid) to go
backwards, and that's bad news.

  I wonder if pg_upgrade shouldn't be doing the conservative thing here,
  which AFAICT would be to set all frozenxid values as furthest in the
  past as possible (without causing a shutdown-due-to-wraparound, and
  maybe without causing autovacuum to enter emergency mode either).
 
 I already get complaints about requiring an analyze run after the
 upgrade --- this would make it much worse.  In fact I have to look into
 upgrading optimizer statistics someday.

Why would it make it worse at all?  It doesn't look to me like it
wouldn't affect in any way.  The only thing it does, is tell the system
to keep clog segments around.

-- 
Álvaro Herrera alvhe...@commandprompt.com
The PostgreSQL Company - Command Prompt, Inc.
PostgreSQL Replication, Consulting, Custom Development, 24x7 support

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-31 Thread Bruce Momjian
Alvaro Herrera wrote:
   I don't understand the pg_upgrade code here.  It is setting the
   datfrozenxid and relfrozenxid values to the latest checkpoint's NextXID,
   
   /* set pg_class.relfrozenxid */
   PQclear(executeQueryOrDie(conn,
 UPDATE   pg_catalog.pg_class 
 SET  relfrozenxid = '%u' 
   /* only heap and TOAST are vacuumed */
 WHERErelkind IN ('r', 't'),
 old_cluster.controldata.chkpnt_nxtxid));
   
   but I don't see why this is safe.  I mean, surely the previous
   vacuum might have been a lot earlier than that.  Are these values reset
   to more correct values (i.e. older ones) later somehow?  My question is,
   why isn't the new cluster completely screwed?
  
  Have you looked at my pg_upgrade presentation?
  
  http://momjian.us/main/presentations/features.html#pg_upgrade
 
 I just did, but it doesn't explain this in much detail.  (In any case I
 don't think we should be relying in a PDF presentation to explain the
 inner pg_upgrade details.  I think we should rely more on the
 IMPLEMENTATION file rather than your PDF ... amusingly that file doesn't
 mention the frozenxids.)
 
  This query happens after we have done a VACUUM FREEEZE on an empty
  cluster.
 
 Oh, so it only affects the databases that initdb created, right?
 The other ones are not even created yet.

Right.

  pg_dump --binary-upgrade will dump out the proper relfrozen xids for
  every object that gets its file system files copied or linked.
 
 Okay.  I assume that between the moment you copy the pg_clog files from
 the old server, and the moment you do the UPDATEs on pg_class and
 pg_database, there is no chance for vacuum to run and remove clog
 segments.

Right, we disable it, and had a long discussion about it.  We actually
start the server with:

-c autovacuum=off -c autovacuum_freeze_max_age=20,

 Still, it seems to me that this coding makes Min(datfrozenxid) to go
 backwards, and that's bad news.

Yes, it is odd, but I don't see another option.  Remember the problem
with xid wrap-around --- we really are defining two different xid eras,
and have to freeze to make that possible.

   I wonder if pg_upgrade shouldn't be doing the conservative thing here,
   which AFAICT would be to set all frozenxid values as furthest in the
   past as possible (without causing a shutdown-due-to-wraparound, and
   maybe without causing autovacuum to enter emergency mode either).
  
  I already get complaints about requiring an analyze run after the
  upgrade --- this would make it much worse.  In fact I have to look into
  upgrading optimizer statistics someday.
 
 Why would it make it worse at all?  It doesn't look to me like it
 wouldn't affect in any way.  The only thing it does, is tell the system
 to keep clog segments around.

It will cause excessive vacuum freezing to happen on startup, I assume.

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-31 Thread hubert depesz lubaczewski
On Wed, Aug 31, 2011 at 01:23:05PM -0400, Bruce Momjian wrote:
 Can you get me the 9.0.X pg_class.relfrozenxid for the toast and heap
 tables involved?

Sure:

=# select oid::regclass, relfrozenxid from pg_class  where relname in 
('transactions', 'pg_toast_106668498');
 oid | relfrozenxid 
-+--
 pg_toast.pg_toast_106668498 |   3673553926
 transactions|   3623560321
(2 rows)

Best regards,

depesz

-- 
The best thing about modern society is how easy it is to avoid contact with it.
 http://depesz.com/

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-31 Thread Bruce Momjian

FYI, I am working with depesz on IM right now and will report back when
we have a cause of the bug.  FYI, I was without electric power for 53
hours, which is why I am late in replying to this report.

---

daveg wrote:
 On Mon, Aug 29, 2011 at 07:49:24PM +0200, hubert depesz lubaczewski wrote:
  On Mon, Aug 29, 2011 at 06:54:41PM +0200, hubert depesz lubaczewski wrote:
  vacuumdb: vacuuming of database etsy_v2 failed: ERROR:  could not access 
  status of transaction 3429738606
  DETAIL:  Could not open file pg_clog/0CC6: No such file or directory.
  
  Interestingly.
  
  In old dir there is pg_clog directory with files:
  0AC0 .. 0DAF (including 0CC6, size 262144)
  but new pg_clog has only:
  0D2F .. 0DB0
  
  File content - nearly all files that exist in both places are the same, 
  with exception of 2 newest ones in new datadir:
  3c5122f3e80851735c19522065a2d12a  0DAF
  8651fc2b9fa3d27cfb5b496165cead68  0DB0
  
  0DB0 doesn't exist in old, and 0DAF has different md5sum: 
  7d48996c762d6a10f8eda88ae766c5dd
  
  one more thing. I did select count(*) from transactions and it worked.
  
  that's about it. I can probably copy over files from old datadir to new (in
  pg_clog/), and will be happy to do it, but I'll wait for your call - retry 
  with
  copies files might destroy some evidence.
 
 I had this same thing happen this Saturday just past and my client had to
 restore the whole 2+ TB instance from the previous days pg_dumps.
 I had been thinking that perhaps I did something wrong in setting up or
 running the upgrade, but had not found it yet. Now that I see Hubert has
 the same problem it is starting to look like pg_upgrade can eat all your
 data.
 
 After running pg_upgrade apparently successfully and analyzeing all the
 tables we restarted the production workload and started getting errors:
 
 2011-08-27 04:18:34.015  12337  c06  postgres  ERROR:  could not access 
 status of transaction 2923961093
 2011-08-27 04:18:34.015  12337  c06  postgres  DETAIL:  Could not open file 
 pg_clog/0AE4: No such file or directory.
 2011-08-27 04:18:34.015  12337  c06  postgres  STATEMENT:  analyze 
 public.b_pxx;
 
 On examination the pg_clog directory contained on two files timestamped
 after the startup of the new cluster with 9.0.4. Other hosts that upgraded
 successfully had numerous files in pg_clog dating back a few days. So it
 appears that all the clog files went missing during the upgrade somehow.
 a
 This happened upgrading from 8.4.7 to 9.0.4, with a brief session in between
 at 8.4.8. We have upgraded several hosts to 9.0.4 successfully previously.
 
 -dg
 
 -- 
 David Gould   da...@sonic.net  510 536 1443510 282 0869
 If simplicity worked, the world would be overrun with insects.

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-31 Thread Bruce Momjian
hubert depesz lubaczewski wrote:
 On Wed, Aug 31, 2011 at 01:23:05PM -0400, Bruce Momjian wrote:
  Can you get me the 9.0.X pg_class.relfrozenxid for the toast and heap
  tables involved?
 
 Sure:
 
 =# select oid::regclass, relfrozenxid from pg_class  where relname in 
 ('transactions', 'pg_toast_106668498');
  oid | relfrozenxid 
 -+--
  pg_toast.pg_toast_106668498 |   3673553926
  transactions|   3623560321
 (2 rows)

Working with depesz, I have found the cause.  The code I added to fix
pg_upgrade in 9.0.4 and earlier releases didn't handle old 8.3 servers
properly.  I mistakenly processed toast table with the same pg_dump
query as used for pre-8.4 toast tables, not realizing those were not
functional because there were no reloptions for toast tables in pre-8.4.

The attached applied patches fix all releases.  This will have to be
mentioned in the 9.0.5 release notes, and we should probably do the same
kind of announcement we did when I fixed this for 9.0.4.  :-(

Yeah, I should not have caused this bug.  It did not show up in any of
my testing.

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
new file mode 100644
index b00e19b..c5816ae
*** a/src/bin/pg_dump/pg_dump.c
--- b/src/bin/pg_dump/pg_dump.c
*** getTables(int *numTables)
*** 3256,3269 
  		 * owning column, if any (note this dependency is AUTO as of 8.2)
  		 */
  		appendPQExpBuffer(query,
! 		  SELECT c.tableoid, c.oid, relname, 
! 		  relacl, relkind, relnamespace, 
! 		  (%s relowner) AS rolname, 
! 		  relchecks, (reltriggers  0) AS relhastriggers, 
! 		  relhasindex, relhasrules, relhasoids, 
! 		  relfrozenxid, 
! 		  0 AS toid, 
! 		  0 AS tfrozenxid, 
  		  d.refobjid AS owning_tab, 
  		  d.refobjsubid AS owning_col, 
  		  (SELECT spcname FROM pg_tablespace t WHERE t.oid = c.reltablespace) AS reltablespace, 
--- 3256,3268 
  		 * owning column, if any (note this dependency is AUTO as of 8.2)
  		 */
  		appendPQExpBuffer(query,
! 		  SELECT c.tableoid, c.oid, c.relname, 
! 		  c.relacl, c.relkind, c.relnamespace, 
! 		  (%s c.relowner) AS rolname, 
! 		  c.relchecks, (c.reltriggers  0) AS relhastriggers, 
! 		  c.relhasindex, c.relhasrules, c.relhasoids, 
! 		  c.relfrozenxid, tc.oid AS toid, 
! 		  tc.relfrozenxid AS tfrozenxid, 
  		  d.refobjid AS owning_tab, 
  		  d.refobjsubid AS owning_col, 
  		  (SELECT spcname FROM pg_tablespace t WHERE t.oid = c.reltablespace) AS reltablespace, 
*** getTables(int *numTables)
*** 3275,3281 
  		  d.classid = c.tableoid AND d.objid = c.oid AND 
  		  d.objsubid = 0 AND 
  		  d.refclassid = c.tableoid AND d.deptype = 'a') 
! 		  WHERE relkind in ('%c', '%c', '%c', '%c') 
  		  ORDER BY c.oid,
  		  username_subquery,
  		  RELKIND_SEQUENCE,
--- 3274,3281 
  		  d.classid = c.tableoid AND d.objid = c.oid AND 
  		  d.objsubid = 0 AND 
  		  d.refclassid = c.tableoid AND d.deptype = 'a') 
! 	   LEFT JOIN pg_class tc ON (c.reltoastrelid = tc.oid) 
! 		  WHERE c.relkind in ('%c', '%c', '%c', '%c') 
  		  ORDER BY c.oid,
  		  username_subquery,
  		  RELKIND_SEQUENCE,
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
new file mode 100644
index d6a547f..b73392b
*** a/src/bin/pg_dump/pg_dump.c
--- b/src/bin/pg_dump/pg_dump.c
*** getTables(int *numTables)
*** 3516,3529 
  		 * owning column, if any (note this dependency is AUTO as of 8.2)
  		 */
  		appendPQExpBuffer(query,
! 		  SELECT c.tableoid, c.oid, relname, 
! 		  relacl, relkind, relnamespace, 
! 		  (%s relowner) AS rolname, 
! 		  relchecks, (reltriggers  0) AS relhastriggers, 
! 		  relhasindex, relhasrules, relhasoids, 
! 		  relfrozenxid, 
! 		  0 AS toid, 
! 		  0 AS tfrozenxid, 
  		  NULL AS reloftype, 
  		  d.refobjid AS owning_tab, 
  		  d.refobjsubid AS owning_col, 
--- 3516,3528 
  		 * owning column, if any (note this dependency is AUTO as of 8.2)
  		 */
  		appendPQExpBuffer(query,
! 		  SELECT c.tableoid, c.oid, c.relname, 
! 		  c.relacl, c.relkind, c.relnamespace, 
! 		  (%s c.relowner) AS rolname, 
! 		  c.relchecks, (c.reltriggers  0) AS relhastriggers, 
! 		  c.relhasindex, c.relhasrules, c.relhasoids, 
! 		  c.relfrozenxid, tc.oid AS toid, 
! 		  tc.relfrozenxid AS tfrozenxid, 
  		  NULL AS reloftype, 
  		  d.refobjid AS owning_tab, 
  		  d.refobjsubid AS owning_col, 
*** getTables(int *numTables)
*** 3536,3542 
  		  d.classid = c.tableoid AND d.objid = c.oid AND 
  		  d.objsubid = 0 AND 
  		  d.refclassid = 

Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-31 Thread Bruce Momjian
daveg wrote:
 On Mon, Aug 29, 2011 at 07:49:24PM +0200, hubert depesz lubaczewski wrote:
  On Mon, Aug 29, 2011 at 06:54:41PM +0200, hubert depesz lubaczewski wrote:
  vacuumdb: vacuuming of database etsy_v2 failed: ERROR:  could not access 
  status of transaction 3429738606
  DETAIL:  Could not open file pg_clog/0CC6: No such file or directory.
  
  Interestingly.
  
  In old dir there is pg_clog directory with files:
  0AC0 .. 0DAF (including 0CC6, size 262144)
  but new pg_clog has only:
  0D2F .. 0DB0
  
  File content - nearly all files that exist in both places are the same, 
  with exception of 2 newest ones in new datadir:
  3c5122f3e80851735c19522065a2d12a  0DAF
  8651fc2b9fa3d27cfb5b496165cead68  0DB0
  
  0DB0 doesn't exist in old, and 0DAF has different md5sum: 
  7d48996c762d6a10f8eda88ae766c5dd
  
  one more thing. I did select count(*) from transactions and it worked.
  
  that's about it. I can probably copy over files from old datadir to new (in
  pg_clog/), and will be happy to do it, but I'll wait for your call - retry 
  with
  copies files might destroy some evidence.
 
 I had this same thing happen this Saturday just past and my client had to
 restore the whole 2+ TB instance from the previous days pg_dumps.
 I had been thinking that perhaps I did something wrong in setting up or
 running the upgrade, but had not found it yet. Now that I see Hubert has
 the same problem it is starting to look like pg_upgrade can eat all your
 data.
 
 After running pg_upgrade apparently successfully and analyzeing all the
 tables we restarted the production workload and started getting errors:
 
 2011-08-27 04:18:34.015  12337  c06  postgres  ERROR:  could not access 
 status of transaction 2923961093
 2011-08-27 04:18:34.015  12337  c06  postgres  DETAIL:  Could not open file 
 pg_clog/0AE4: No such file or directory.
 2011-08-27 04:18:34.015  12337  c06  postgres  STATEMENT:  analyze 
 public.b_pxx;
 
 On examination the pg_clog directory contained on two files timestamped
 after the startup of the new cluster with 9.0.4. Other hosts that upgraded
 successfully had numerous files in pg_clog dating back a few days. So it
 appears that all the clog files went missing during the upgrade somehow.
 a
 This happened upgrading from 8.4.7 to 9.0.4, with a brief session in between
 at 8.4.8. We have upgraded several hosts to 9.0.4 successfully previously.

I have posted this fix to the hackers email list, but I found it only
affected old 8.3 servers, not old 8.4.X, so I am confused by your bug
report.

I have tested 8.4.X to 9.0.4 and found pg_upgrade preserves toast
relfrozenxids properly in that case.

Can you tell me what table is showing this error?  Does it happen during
vacuum?  Can you run a vacuum verbose to see what it is throwing the
error on?  Thanks.

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-29 Thread hubert depesz lubaczewski
On Fri, Aug 26, 2011 at 05:28:35PM +0200, hubert depesz lubaczewski wrote:
 On Fri, Aug 26, 2011 at 12:18:55AM -0400, Bruce Momjian wrote:
  
  OK, this was very helpful.  I found out that there is a bug in current
  9.0.X, 9.1.X, and HEAD that I introduced recently when I excluded temp
  tables.  (The bug is not in any released version of pg_upgrade.)  The
  attached, applied patches should fix it for you.  I assume you are
  running 9.0.X, and not 9.0.4.
 
 pg_upgrade worked. Now I'm doing reindex and later on vacuumdb -az.

vacuumdb failed. The fail looks very similar to the one I had on 9.0.4.

After long vacuum I got:
INFO:  vacuuming pg_toast.pg_toast_106668498
vacuumdb: vacuuming of database etsy_v2 failed: ERROR:  could not access 
status of transaction 3429738606
DETAIL:  Could not open file pg_clog/0CC6: No such file or directory.

Unfortunately at the moment, I no longer have the old (8.3) setup, but I do
have the 9.0.X and will be happy to provide any info you might need to help me
debug/fix the problem.

Best regards,

depesz


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-29 Thread hubert depesz lubaczewski
On Mon, Aug 29, 2011 at 06:54:41PM +0200, hubert depesz lubaczewski wrote:
 On Fri, Aug 26, 2011 at 05:28:35PM +0200, hubert depesz lubaczewski wrote:
  On Fri, Aug 26, 2011 at 12:18:55AM -0400, Bruce Momjian wrote:
   
   OK, this was very helpful.  I found out that there is a bug in current
   9.0.X, 9.1.X, and HEAD that I introduced recently when I excluded temp
   tables.  (The bug is not in any released version of pg_upgrade.)  The
   attached, applied patches should fix it for you.  I assume you are
   running 9.0.X, and not 9.0.4.
  
  pg_upgrade worked. Now I'm doing reindex and later on vacuumdb -az.
 
 vacuumdb failed. The fail looks very similar to the one I had on 9.0.4.
 
 After long vacuum I got:
 INFO:  vacuuming pg_toast.pg_toast_106668498
 vacuumdb: vacuuming of database etsy_v2 failed: ERROR:  could not access 
 status of transaction 3429738606
 DETAIL:  Could not open file pg_clog/0CC6: No such file or directory.
 
 Unfortunately at the moment, I no longer have the old (8.3) setup, but I do
 have the 9.0.X and will be happy to provide any info you might need to help me
 debug/fix the problem.

this pg_toast is related to table transactions, which was vacuumed
like this:

INFO:  vacuuming public.transactions
INFO:  index transaction_id_pkey now contains 50141303 row versions in 144437 
pages
DETAIL:  0 index row versions were removed.
0 index pages have been deleted, 0 are currently reusable.
CPU 1.08s/0.13u sec elapsed 173.04 sec.
INFO:  index transactions_creation_tsz_idx now contains 50141303 row versions 
in 162634 pages
DETAIL:  0 index row versions were removed.
0 index pages have been deleted, 0 are currently reusable.
CPU 1.19s/0.23u sec elapsed 77.45 sec.
INFO:  index fki_transactions_xx_fkey now contains 50141303 row 
versions in 163466 pages
DETAIL:  0 index row versions were removed.
0 index pages have been deleted, 0 are currently reusable.
CPU 1.13s/0.29u sec elapsed 65.45 sec.
INFO:  index fki_transactions__fkey now contains 50141303 row 
versions in 146528 pages
DETAIL:  0 index row versions were removed.
0 index pages have been deleted, 0 are currently reusable.
CPU 1.15s/0.24u sec elapsed 50.28 sec.
INFO:  index fki_transactions_x_fkey now contains 50141303 row 
versions in 190914 pages
DETAIL:  0 index row versions were removed.
5 index pages have been deleted, 0 are currently reusable.
CPU 1.49s/0.17u sec elapsed 67.95 sec.
INFO:  index transactions_xx_id now contains 50141303 
row versions in 164669 pages
DETAIL:  0 index row versions were removed.
2 index pages have been deleted, 0 are currently reusable.
CPU 1.36s/0.18u sec elapsed 62.83 sec.
INFO:  transactions: found 0 removable, 39644831 nonremovable row versions in 
5978240 out of 7312036 pages
DETAIL:  0 dead row versions cannot be removed yet.
There were 8209452 unused item pointers.
0 pages are entirely empty.
CPU 75.75s/18.57u sec elapsed 9268.19 sec.
INFO:  vacuuming pg_toast.pg_toast_106668498
vacuumdb: vacuuming of database etsy_v2 failed: ERROR:  could not access 
status of transaction 3429738606
DETAIL:  Could not open file pg_clog/0CC6: No such file or directory.

Interestingly.

In old dir there is pg_clog directory with files:
0AC0 .. 0DAF (including 0CC6, size 262144)
but new pg_clog has only:
0D2F .. 0DB0

File content - nearly all files that exist in both places are the same, with 
exception of 2 newest ones in new datadir:
3c5122f3e80851735c19522065a2d12a  0DAF
8651fc2b9fa3d27cfb5b496165cead68  0DB0

0DB0 doesn't exist in old, and 0DAF has different md5sum: 
7d48996c762d6a10f8eda88ae766c5dd

one more thing. I did select count(*) from transactions and it worked.

that's about it. I can probably copy over files from old datadir to new (in
pg_clog/), and will be happy to do it, but I'll wait for your call - retry with
copies files might destroy some evidence.

Best regards,

depesz

-- 
The best thing about modern society is how easy it is to avoid contact with it.
 http://depesz.com/

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-29 Thread Alvaro Herrera
Excerpts from hubert depesz lubaczewski's message of lun ago 29 14:49:24 -0300 
2011:
 On Mon, Aug 29, 2011 at 06:54:41PM +0200, hubert depesz lubaczewski wrote:
  On Fri, Aug 26, 2011 at 05:28:35PM +0200, hubert depesz lubaczewski wrote:
   On Fri, Aug 26, 2011 at 12:18:55AM -0400, Bruce Momjian wrote:

OK, this was very helpful.  I found out that there is a bug in current
9.0.X, 9.1.X, and HEAD that I introduced recently when I excluded temp
tables.  (The bug is not in any released version of pg_upgrade.)  The
attached, applied patches should fix it for you.  I assume you are
running 9.0.X, and not 9.0.4.
   
   pg_upgrade worked. Now I'm doing reindex and later on vacuumdb -az.
  
  vacuumdb failed. The fail looks very similar to the one I had on 9.0.4.
  
  After long vacuum I got:
  INFO:  vacuuming pg_toast.pg_toast_106668498
  vacuumdb: vacuuming of database etsy_v2 failed: ERROR:  could not access 
  status of transaction 3429738606
  DETAIL:  Could not open file pg_clog/0CC6: No such file or directory.

I don't understand the pg_upgrade code here.  It is setting the
datfrozenxid and relfrozenxid values to the latest checkpoint's NextXID,

/* set pg_class.relfrozenxid */
PQclear(executeQueryOrDie(conn,
  UPDATE   pg_catalog.pg_class 
  SET  relfrozenxid = '%u' 
/* only heap and TOAST are vacuumed */
  WHERErelkind IN ('r', 't'),
  old_cluster.controldata.chkpnt_nxtxid));

but I don't see why this is safe.  I mean, surely the previous
vacuum might have been a lot earlier than that.  Are these values reset
to more correct values (i.e. older ones) later somehow?  My question is,
why isn't the new cluster completely screwed?

I wonder if pg_upgrade shouldn't be doing the conservative thing here,
which AFAICT would be to set all frozenxid values as furthest in the
past as possible (without causing a shutdown-due-to-wraparound, and
maybe without causing autovacuum to enter emergency mode either).

-- 
Álvaro Herrera alvhe...@commandprompt.com
The PostgreSQL Company - Command Prompt, Inc.
PostgreSQL Replication, Consulting, Custom Development, 24x7 support

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-29 Thread daveg
On Mon, Aug 29, 2011 at 07:49:24PM +0200, hubert depesz lubaczewski wrote:
 On Mon, Aug 29, 2011 at 06:54:41PM +0200, hubert depesz lubaczewski wrote:
 vacuumdb: vacuuming of database etsy_v2 failed: ERROR:  could not access 
 status of transaction 3429738606
 DETAIL:  Could not open file pg_clog/0CC6: No such file or directory.
 
 Interestingly.
 
 In old dir there is pg_clog directory with files:
 0AC0 .. 0DAF (including 0CC6, size 262144)
 but new pg_clog has only:
 0D2F .. 0DB0
 
 File content - nearly all files that exist in both places are the same, with 
 exception of 2 newest ones in new datadir:
 3c5122f3e80851735c19522065a2d12a  0DAF
 8651fc2b9fa3d27cfb5b496165cead68  0DB0
 
 0DB0 doesn't exist in old, and 0DAF has different md5sum: 
 7d48996c762d6a10f8eda88ae766c5dd
 
 one more thing. I did select count(*) from transactions and it worked.
 
 that's about it. I can probably copy over files from old datadir to new (in
 pg_clog/), and will be happy to do it, but I'll wait for your call - retry 
 with
 copies files might destroy some evidence.

I had this same thing happen this Saturday just past and my client had to
restore the whole 2+ TB instance from the previous days pg_dumps.
I had been thinking that perhaps I did something wrong in setting up or
running the upgrade, but had not found it yet. Now that I see Hubert has
the same problem it is starting to look like pg_upgrade can eat all your
data.

After running pg_upgrade apparently successfully and analyzeing all the
tables we restarted the production workload and started getting errors:

2011-08-27 04:18:34.015  12337  c06  postgres  ERROR:  could not access status 
of transaction 2923961093
2011-08-27 04:18:34.015  12337  c06  postgres  DETAIL:  Could not open file 
pg_clog/0AE4: No such file or directory.
2011-08-27 04:18:34.015  12337  c06  postgres  STATEMENT:  analyze public.b_pxx;

On examination the pg_clog directory contained on two files timestamped
after the startup of the new cluster with 9.0.4. Other hosts that upgraded
successfully had numerous files in pg_clog dating back a few days. So it
appears that all the clog files went missing during the upgrade somehow.
a
This happened upgrading from 8.4.7 to 9.0.4, with a brief session in between
at 8.4.8. We have upgraded several hosts to 9.0.4 successfully previously.

-dg

-- 
David Gould   da...@sonic.net  510 536 1443510 282 0869
If simplicity worked, the world would be overrun with insects.

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-26 Thread hubert depesz lubaczewski
On Fri, Aug 26, 2011 at 12:18:55AM -0400, Bruce Momjian wrote:
 
 OK, this was very helpful.  I found out that there is a bug in current
 9.0.X, 9.1.X, and HEAD that I introduced recently when I excluded temp
 tables.  (The bug is not in any released version of pg_upgrade.)  The
 attached, applied patches should fix it for you.  I assume you are
 running 9.0.X, and not 9.0.4.

pg_upgrade worked. Now I'm doing reindex and later on vacuumdb -az.

will keep you posted.

Best regards,

depesz


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-25 Thread Bruce Momjian
hubert depesz lubaczewski wrote:
 hi
 
 I have 8.3.11 database, ~ 600GB in size.
 
 I want to upgrade it to 9.0.
 
 First, I tried with 9.0.4, and when I hit problem (the same) I tried
 git, head of 9.0 branch.

Good.

 pg_upgrade_dump_db.sql-
 pg_upgrade_dump_db.sql--- For binary upgrade, must preserve relfilenodes
 pg_upgrade_dump_db.sql-SELECT 
 binary_upgrade.set_next_heap_relfilenode('71637068'::pg_catalog.oid);
 pg_upgrade_dump_db.sql:SELECT 
 binary_upgrade.set_next_toast_relfilenode('71637071'::pg_catalog.oid);
 pg_upgrade_dump_db.sql-SELECT 
 binary_upgrade.set_next_index_relfilenode('71637073'::pg_catalog.oid);
 pg_upgrade_dump_db.sql-
 pg_upgrade_dump_db.sql-CREATE TABLE actions (
 --
 pg_upgrade.log-linking /var/postgresql//base/113953649/2613 to 
 /var/postgresql/-9.0/base/11826/11790
 pg_upgrade.log-  /var/postgresql//base/113953649/2683
 pg_upgrade.log-linking /var/postgresql//base/113953649/2683 to 
 /var/postgresql/-9.0/base/11826/11792
 pg_upgrade.log:Could not find 71637071 in old cluster

The problem appears to be that the Postgres catalogs think there is a
toast table for 'actions', while the file system doesn't seem to have
such a file.  I can you look in pg_class and verify that?

SELECT reltoastrelid FROM pg_class WHERE relname  = 'actions';

Then look in the file system to see if there is a matching file.

 One more thing - one of earlier tests actually worked through
 pg_upgrade, but when running vacuumdb -az on newly started 9.0.4, I got
 error about missing transaction/clog - don't remember exactly what it
 was, though.

THere was a bug in how how pg_upgrade worked in pre-9.0.4 --- could it
have been that?

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-25 Thread hubert depesz lubaczewski
On Thu, Aug 25, 2011 at 04:33:07PM -0400, Bruce Momjian wrote:
 The problem appears to be that the Postgres catalogs think there is a
 toast table for 'actions', while the file system doesn't seem to have
 such a file.  I can you look in pg_class and verify that?
 
   SELECT reltoastrelid FROM pg_class WHERE relname  = 'actions';

$ SELECT reltoastrelid FROM pg_class WHERE relname  = 'actions';
 reltoastrelid 
---
(0 rows)

This is done not on the pg from backup, but on normal production, as the test
pg instance doesn't work anymore.

I can re-set the test instance, but extracting from backup, and making it apply
all xlogs usually takes 2-3 days.

  One more thing - one of earlier tests actually worked through
  pg_upgrade, but when running vacuumdb -az on newly started 9.0.4, I got
  error about missing transaction/clog - don't remember exactly what it
  was, though.
 
 THere was a bug in how how pg_upgrade worked in pre-9.0.4 --- could it
 have been that?

It was done definitely using 9.0.4.

Best regards,

depesz


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-25 Thread Bruce Momjian
hubert depesz lubaczewski wrote:
 On Thu, Aug 25, 2011 at 04:33:07PM -0400, Bruce Momjian wrote:
  The problem appears to be that the Postgres catalogs think there is a
  toast table for 'actions', while the file system doesn't seem to have
  such a file.  I can you look in pg_class and verify that?
  
  SELECT reltoastrelid FROM pg_class WHERE relname  = 'actions';
 
 $ SELECT reltoastrelid FROM pg_class WHERE relname  = 'actions';
  reltoastrelid 
 ---
 (0 rows)
 
 This is done not on the pg from backup, but on normal production, as the test
 pg instance doesn't work anymore.
 
 I can re-set the test instance, but extracting from backup, and making it 
 apply
 all xlogs usually takes 2-3 days.

If you remove the .old extension on pg_control, you can start the old
cluster and check it.  This is explained by pg_upgrade output:

| If pg_upgrade fails after this point, you must
| re-initdb the new cluster before continuing.
| You will also need to remove the .old suffix
| from /var/postgresql//global/pg_control.old.

Please check the old cluster.

   One more thing - one of earlier tests actually worked through
   pg_upgrade, but when running vacuumdb -az on newly started 9.0.4, I got
   error about missing transaction/clog - don't remember exactly what it
   was, though.
  
  THere was a bug in how how pg_upgrade worked in pre-9.0.4 --- could it
  have been that?
 
 It was done definitely using 9.0.4.

Good.

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-25 Thread hubert depesz lubaczewski
On Thu, Aug 25, 2011 at 04:43:02PM -0400, Bruce Momjian wrote:
 Please check the old cluster.

Sure:

=# SELECT reltoastrelid FROM pg_class WHERE relname  = 'actions';   

  
 reltoastrelid 
---
  82510395
  71637071
(2 rows)

=# SELECT oid::regclass, reltoastrelid FROM pg_class WHERE relname  = 
'actions';  

   
  oid  | reltoastrelid 
---+---
 x.actions |  82510395
 y.actions |  71637071
(2 rows)

=# select oid, relfilenode from pg_class where oid in (SELECT reltoastrelid 
FROM pg_class WHERE relname  = 'actions');
   oid| relfilenode 
--+-
 82510395 |82510395
 71637071 |71637071
(2 rows)

=# select oid from pg_database where datname = current_database();
   oid
--
 71635381
(1 row)

$ ls -l /base/71635381/{71637071,82510395}
-rw--- 1 postgres postgres 0 2009-10-12 06:49 /base/71635381/71637071
-rw--- 1 postgres postgres 0 2010-08-19 14:02 /base/71635381/82510395

One more thing - one of earlier tests actually worked through
pg_upgrade, but when running vacuumdb -az on newly started 9.0.4, I got
error about missing transaction/clog - don't remember exactly what it
was, though.
   THere was a bug in how how pg_upgrade worked in pre-9.0.4 --- could it
   have been that?
  It was done definitely using 9.0.4.
 Good.

Not sure if it's good, since it was after the clog error was fixed, and
I still got it :/

but anyway - the problem with 71637071 is more important now.

Best regards,

depesz


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [GENERAL] pg_upgrade problem

2011-08-25 Thread Bruce Momjian

OK, this was very helpful.  I found out that there is a bug in current
9.0.X, 9.1.X, and HEAD that I introduced recently when I excluded temp
tables.  (The bug is not in any released version of pg_upgrade.)  The
attached, applied patches should fix it for you.  I assume you are
running 9.0.X, and not 9.0.4.

---

hubert depesz lubaczewski wrote:
 On Thu, Aug 25, 2011 at 04:43:02PM -0400, Bruce Momjian wrote:
  Please check the old cluster.
 
 Sure:
 
 =# SELECT reltoastrelid FROM pg_class WHERE relname  = 'actions'; 
   
   
 
  reltoastrelid 
 ---
   82510395
   71637071
 (2 rows)
 
 =# SELECT oid::regclass, reltoastrelid FROM pg_class WHERE relname  = 
 'actions';
   

   oid  | reltoastrelid 
 ---+---
  x.actions |  82510395
  y.actions |  71637071
 (2 rows)
 
 =# select oid, relfilenode from pg_class where oid in (SELECT reltoastrelid 
 FROM pg_class WHERE relname  = 'actions');
oid| relfilenode 
 --+-
  82510395 |82510395
  71637071 |71637071
 (2 rows)
 
 =# select oid from pg_database where datname = current_database();
oid
 --
  71635381
 (1 row)
 
 $ ls -l /base/71635381/{71637071,82510395}
 -rw--- 1 postgres postgres 0 2009-10-12 06:49 /base/71635381/71637071
 -rw--- 1 postgres postgres 0 2010-08-19 14:02 /base/71635381/82510395
 
 One more thing - one of earlier tests actually worked through
 pg_upgrade, but when running vacuumdb -az on newly started 9.0.4, I 
 got
 error about missing transaction/clog - don't remember exactly what it
 was, though.
THere was a bug in how how pg_upgrade worked in pre-9.0.4 --- could it
have been that?
   It was done definitely using 9.0.4.
  Good.
 
 Not sure if it's good, since it was after the clog error was fixed, and
 I still got it :/
 
 but anyway - the problem with 71637071 is more important now.
 
 Best regards,
 
 depesz

-- 
  Bruce Momjian  br...@momjian.ushttp://momjian.us
  EnterpriseDB http://enterprisedb.com

  + It's impossible for everything to be true. +
diff --git a/contrib/pg_upgrade/info.c b/contrib/pg_upgrade/info.c
new file mode 100644
index ca357e7..1f5b7ae
*** a/contrib/pg_upgrade/info.c
--- b/contrib/pg_upgrade/info.c
*** get_rel_infos(migratorContext *ctx, cons
*** 328,336 
  			 	ON c.reltablespace = t.oid 
  			 WHERE (( 
  			 /* exclude pg_catalog and pg_temp_ (could be orphaned tables) */
! 			 	n.nspname !~ '^pg_' 
! 			 	AND n.nspname != 'information_schema' 
! 			 	AND c.oid = %u 
  			 	) OR ( 
  			 	n.nspname = 'pg_catalog' 
  			 	AND relname IN 
--- 328,338 
  			 	ON c.reltablespace = t.oid 
  			 WHERE (( 
  			 /* exclude pg_catalog and pg_temp_ (could be orphaned tables) */
! 			 n.nspname != 'pg_catalog' 
! 			 AND n.nspname !~ '^pg_temp_' 
! 			 AND n.nspname !~ '^pg_toast_temp_' 
! 			 	  AND n.nspname != 'information_schema' 
! 			 	  AND c.oid = %u 
  			 	) OR ( 
  			 	n.nspname = 'pg_catalog' 
  			 	AND relname IN 
diff --git a/contrib/pg_upgrade/version_old_8_3.c b/contrib/pg_upgrade/version_old_8_3.c
new file mode 100644
index 930f76d..6fcd61b
*** a/contrib/pg_upgrade/version_old_8_3.c
--- b/contrib/pg_upgrade/version_old_8_3.c
*** old_8_3_check_for_name_data_type_usage(m
*** 61,69 
  		NOT a.attisdropped AND 
  		a.atttypid = 'pg_catalog.name'::pg_catalog.regtype AND 
  		c.relnamespace = n.oid AND 
! 			 /* exclude pg_catalog and pg_temp_ (could be orphaned tables) */
!    n.nspname !~ '^pg_' AND 
! 		 		n.nspname != 'information_schema');
  
  		ntups = PQntuples(res);
  		i_nspname = PQfnumber(res, nspname);
--- 61,71 
  		NOT a.attisdropped AND 
  		a.atttypid = 'pg_catalog.name'::pg_catalog.regtype AND 
  		c.relnamespace = n.oid AND 
! /* exclude possibly orphaned temp tables */
! 			 			n.nspname != 'pg_catalog' AND 
! 		n.nspname !~ '^pg_temp_' AND 
! 		n.nspname !~ '^pg_toast_temp_' AND 
! 		n.nspname != 'information_schema' );
  
  		ntups = PQntuples(res);
  		i_nspname = PQfnumber(res, nspname);
*** old_8_3_check_for_tsquery_usage(migrator
*** 152,160 
  		NOT a.attisdropped AND 
  		a.atttypid = 'pg_catalog.tsquery'::pg_catalog.regtype AND 
  		c.relnamespace = n.oid AND 
! 			 /* exclude pg_catalog and pg_temp_ (could be orphaned tables) */
!    n.nspname !~ '^pg_' AND 
!