Hi,

I received a question about inconsistent state after crash recovery.

When a table file is broken (or just lost), PostgreSQL can not recover
a whole table, and does not show any notice while recoverying.
I think it means "inconsistent" state.

(1) create a table, and fill records.
(2) process a checkpoint.
(3) fill more records.
(4) force a crash, and delete the table file.
(5) run recovery on restarting.
(6) only records after the checkpoint can be recoverd.

For example, the attached log shows that PostgreSQL can recover
only 1058 records in the table which contains 2000 records
before the crash, and does not tell anything in the server log.

----------------------------------------------------------
insert into t1 values (trim(to_char(generate_series(1,1000),
'00000000000000000000')) );
INSERT 0 1000
select count(*) from t1;
 count
-------
  1000
(1 row)

checkpoint;
CHECKPOINT
insert into t1 values (trim(to_char(generate_series(1001,2000),
'00000000000000000000')) );
INSERT 0 1000
select count(*) from t1;
 count
-------
  2000
(1 row)

(delete the table file)
(kill postgres)
(restart postgres with recovery)

select count(*) from t1;
 count
-------
  1058
(1 row)
----------------------------------------------------------

Is this expected or acceptable?

I think, at least, PostgreSQL should say something about this
situation in the server log, because DBA can not recognize
this situation if no server log exists.

To reproduce it, please check the attached test script.

Any comments?

Regards,
-- 
Satoshi Nagayasu <sn...@uptime.jp>
Uptime Technologies, LLC. http://www.uptime.jp
[snaga@devsv03 postgresql.git]$ sh test_recovery.sh
createdb: database creation failed: ERROR:  database "testdb" already exists
drop table if exists t1;
DROP TABLE
create table t1 ( c1 varchar(20) );
CREATE TABLE
checkpoint;
CHECKPOINT
select count(*) from t1;
 count
-------
     0
(1 row)

select relname,relfilenode from pg_class where relname='t1';
 relname | relfilenode
---------+-------------
 t1      |      147456
(1 row)

select oid,* from pg_database where datname=current_database();
  oid  | datname | datdba | encoding | datcollate | datctype | datistemplate | 
datallowconn | datconnlimit | datlastsysoid | datfrozenxid | datminmxid | 
dattablespace | datacl
-------+---------+--------+----------+------------+----------+---------------+--------------+--------------+---------------+--------------+------------+---------------+--------
 16384 | testdb  |     10 |        0 | C          | C        | f             | 
t            |           -1 |         12895 |         1799 |          1 |       
   1663 |
(1 row)

insert into t1 values (trim(to_char(generate_series(1,1000), 
'00000000000000000000')) );
INSERT 0 1000
select count(*) from t1;
 count
-------
  1000
(1 row)

select pg_relation_size('t1');
 pg_relation_size
------------------
            57344
(1 row)

checkpoint;
CHECKPOINT
insert into t1 values (trim(to_char(generate_series(1001,2000), 
'00000000000000000000')) );
INSERT 0 1000
select count(*) from t1;
 count
-------
  2000
(1 row)

select pg_relation_size('t1');
 pg_relation_size
------------------
           106496
(1 row)

/tmp/pgsql/data/base/16384/147456
waiting for server to start....2013-07-26 13:33:03 JST [@ 14651] LOG:  loaded 
library "pg_stat_statements"
 done
server started
2013-07-26 13:33:03 JST [@ 14653] LOG:  database system was interrupted; last 
known up at 2013-07-26 13:33:03 JST
2013-07-26 13:33:03 JST [@ 14653] LOG:  database system was not properly shut 
down; automatic recovery in progress
2013-07-26 13:33:03 JST [@ 14653] LOG:  redo starts at 0/1D493A0
2013-07-26 13:33:03 JST [@ 14653] LOG:  record with zero length at 0/1D5D958
2013-07-26 13:33:03 JST [@ 14653] LOG:  redo done at 0/1D5D928
2013-07-26 13:33:03 JST [@ 14653] LOG:  last completed transaction was at log 
time 2013-07-26 13:33:03.158161+09
2013-07-26 13:33:03 JST [@ 14651] LOG:  database system is ready to accept 
connections
2013-07-26 13:33:03 JST [@ 14657] LOG:  autovacuum launcher started
select count(*) from t1;
 count
-------
  1058
(1 row)

select pg_relation_size('t1');
 pg_relation_size
------------------
           106496
(1 row)

createdb: database creation failed: ERROR:  database "testdb" already exists
drop table if exists t1;
DROP TABLE
create table t1 ( c1 varchar(20) );
CREATE TABLE
checkpoint;
CHECKPOINT
select count(*) from t1;
 count
-------
     0
(1 row)

select relname,relfilenode from pg_class where relname='t1';
 relname | relfilenode
---------+-------------
 t1      |      155648
(1 row)

select oid,* from pg_database where datname=current_database();
  oid  | datname | datdba | encoding | datcollate | datctype | datistemplate | 
datallowconn | datconnlimit | datlastsysoid | datfrozenxid | datminmxid | 
dattablespace | datacl
-------+---------+--------+----------+------------+----------+---------------+--------------+--------------+---------------+--------------+------------+---------------+--------
 16384 | testdb  |     10 |        0 | C          | C        | f             | 
t            |           -1 |         12895 |         1799 |          1 |       
   1663 |
(1 row)

insert into t1 values (trim(to_char(generate_series(1,1000), 
'00000000000000000000')) );
INSERT 0 1000
select count(*) from t1;
 count
-------
  1000
(1 row)

select pg_relation_size('t1');
 pg_relation_size
------------------
            57344
(1 row)

checkpoint;
CHECKPOINT
insert into t1 values (trim(to_char(generate_series(1001,2000), 
'00000000000000000000')) );
INSERT 0 1000
select count(*) from t1;
 count
-------
  2000
(1 row)

select pg_relation_size('t1');
 pg_relation_size
------------------
           106496
(1 row)

/tmp/pgsql/data/base/16384/155648
waiting for server to start....2013-07-26 13:33:05 JST [@ 14677] LOG:  loaded 
library "pg_stat_statements"
 done
server started
2013-07-26 13:33:05 JST [@ 14679] LOG:  database system was interrupted; last 
known up at 2013-07-26 13:33:04 JST
2013-07-26 13:33:05 JST [@ 14679] LOG:  database system was not properly shut 
down; automatic recovery in progress
2013-07-26 13:33:05 JST [@ 14679] LOG:  redo starts at 0/1D847F0
2013-07-26 13:33:05 JST [@ 14679] LOG:  record with zero length at 0/1D98DA8
2013-07-26 13:33:05 JST [@ 14679] LOG:  redo done at 0/1D98D78
2013-07-26 13:33:05 JST [@ 14679] LOG:  last completed transaction was at log 
time 2013-07-26 13:33:04.936051+09
2013-07-26 13:33:05 JST [@ 14677] LOG:  database system is ready to accept 
connections
2013-07-26 13:33:05 JST [@ 14683] LOG:  autovacuum launcher started
select count(*) from t1;
 count
-------
  1058
(1 row)

select pg_relation_size('t1');
 pg_relation_size
------------------
           106496
(1 row)

[snaga@devsv03 postgresql.git]$
#!/bin/sh

PGHOME=/tmp/pgsql
PGDATA=/tmp/pgsql/data
PATH=${PGHOME}/bin:${PATH}
export PGHOME PGDATA PATH

createdb testdb

psql -e testdb<<EOF
drop table if exists t1;

create table t1 ( c1 varchar(20) );
checkpoint;

select count(*) from t1;

select relname,relfilenode from pg_class where relname='t1';
select oid,* from pg_database where datname=current_database();

insert into t1 values (trim(to_char(generate_series(1,1000), 
'00000000000000000000')) );

-- 1000 records
select count(*) from t1;
select pg_relation_size('t1');

-- write lower blocks in the table file.
checkpoint;

-- write backup blocks and new records in WAL
insert into t1 values (trim(to_char(generate_series(1001,2000), 
'00000000000000000000')) );

-- 2000 records
select count(*) from t1;
select pg_relation_size('t1');
EOF

tablefile=`psql -A -t testdb<<EOF
select
  ( select setting from pg_settings where name='data_directory' ) || '/'  ||
  ( select pg_relation_filepath(oid) from pg_class where relname='t1' )
;
EOF
`

echo $tablefile

# force a process crash, then recovery after that.
killall -9 postgres

# remove the table file.
rm -rf $tablefile

rm -rf ${PGDATA}/*.pid
rm -rf ${PGDATA}/pg_log/*

pg_ctl -w -D ${PGDATA} start

cat ${PGDATA}/pg_log/*

psql -e testdb<<EOF
-- only a backup block and following wal records can be
-- recovered here. so the table is going to be inconsistent.

-- 1000+ (maybe 1058) records, which means inconsistent.
select count(*) from t1;
select pg_relation_size('t1');
EOF

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to