Brock Noland created HIVE-9445:
----------------------------------
Summary: Revert HIVE-5700 - schemaTool failure when date partition
has non-date value
Key: HIVE-9445
URL: https://issues.apache.org/jira/browse/HIVE-9445
Project: Hive
Issue Type: Bug
Components: Metastore
Affects Versions: 0.13.1, 0.14.0, 0.13.0, 0.15.0, 0.14.1
Reporter: Brock Noland
Priority: Blocker
HIVE-5700 causes problems with mysql upgrades (see HIVE-8730) and completely
corrupts postgres.
With a postgres metastore on 0.12, I executed the following:
{noformat}
CREATE TABLE HIVE5700_DATE_PARTED (line string) PARTITIONED BY (ddate date);
CREATE TABLE HIVE5700_STRING_PARTED (line string) PARTITIONED BY (ddate string);
ALTER TABLE HIVE5700_DATE_PARTED ADD PARTITION (ddate='NOT_DATE');
ALTER TABLE HIVE5700_DATE_PARTED ADD PARTITION (ddate='20150121');
ALTER TABLE HIVE5700_DATE_PARTED ADD PARTITION (ddate='20150122');
ALTER TABLE HIVE5700_DATE_PARTED ADD PARTITION (ddate='2015-01-23');
ALTER TABLE HIVE5700_STRING_PARTED ADD PARTITION (ddate='NOT_DATE');
ALTER TABLE HIVE5700_STRING_PARTED ADD PARTITION (ddate='20150121');
ALTER TABLE HIVE5700_STRING_PARTED ADD PARTITION (ddate='20150122');
ALTER TABLE HIVE5700_STRING_PARTED ADD PARTITION (ddate='2015-01-23');
LOAD DATA LOCAL INPATH '/tmp/single-line-of-data' INTO TABLE
HIVE5700_DATE_PARTED PARTITION (ddate='NOT_DATE');
LOAD DATA LOCAL INPATH '/tmp/single-line-of-data' INTO TABLE
HIVE5700_DATE_PARTED PARTITION (ddate='20150121');
LOAD DATA LOCAL INPATH '/tmp/single-line-of-data' INTO TABLE
HIVE5700_DATE_PARTED PARTITION (ddate='20150122');
LOAD DATA LOCAL INPATH '/tmp/single-line-of-data' INTO TABLE
HIVE5700_DATE_PARTED PARTITION (ddate='2015-01-23');
LOAD DATA LOCAL INPATH '/tmp/single-line-of-data' INTO TABLE
HIVE5700_STRING_PARTED PARTITION (ddate='NOT_DATE');
LOAD DATA LOCAL INPATH '/tmp/single-line-of-data' INTO TABLE
HIVE5700_STRING_PARTED PARTITION (ddate='20150121');
LOAD DATA LOCAL INPATH '/tmp/single-line-of-data' INTO TABLE
HIVE5700_STRING_PARTED PARTITION (ddate='20150122');
LOAD DATA LOCAL INPATH '/tmp/single-line-of-data' INTO TABLE
HIVE5700_STRING_PARTED PARTITION (ddate='2015-01-23');
hive> show partitions HIVE5700_DATE_PARTED;
OK
ddate=20150121
ddate=20150122
ddate=2015-01-23
ddate=NOT_DATE
Time taken: 0.052 seconds, Fetched: 4 row(s)
hive> show partitions HIVE5700_STRING_PARTED;
OK
ddate=20150121
ddate=20150122
ddate=2015-01-23
ddate=NOT_DATE
Time taken: 0.051 seconds, Fetched: 4 row(s)
{noformat}
I then took a dump of the database named {{postgres-pre-upgrade.sql}} and the
data in the dump looks good:
{noformat}
[root@hive5700-1-1 ~]# egrep -A9 '^COPY "PARTITIONS"|^COPY
"PARTITION_KEY_VALS"' postgres-pre-upgrade.sql
COPY "PARTITIONS" ("PART_ID", "CREATE_TIME", "LAST_ACCESS_TIME", "PART_NAME",
"SD_ID", "TBL_ID") FROM stdin;
3 1421943647 0 ddate=NOT_DATE 6 2
4 1421943647 0 ddate=20150121 7 2
5 1421943648 0 ddate=20150122 8 2
6 1421943664 0 ddate=NOT_DATE 9 3
7 1421943664 0 ddate=20150121 10 3
8 1421943665 0 ddate=20150122 11 3
9 1421943694 0 ddate=2015-01-23 12 2
10 1421943695 0 ddate=2015-01-23 13 3
\.
--
COPY "PARTITION_KEY_VALS" ("PART_ID", "PART_KEY_VAL", "INTEGER_IDX") FROM stdin;
3 NOT_DATE 0
4 20150121 0
5 20150122 0
6 NOT_DATE 0
7 20150121 0
8 20150122 0
9 2015-01-23 0
10 2015-01-23 0
\.
{noformat}
I then upgraded to 0.13 and subsequently upgraded the MS with the following
command: {{schematool -dbType postgres -upgradeSchema -verbose}}
The file {{postgres-post-upgrade.sql}} is the post-upgrade db dump. As you can
see the data is completely corrupt.
{noformat}
[root@hive5700-1-1 ~]# egrep -A9 '^COPY "PARTITIONS"|^COPY
"PARTITION_KEY_VALS"' postgres-post-upgrade.sql
COPY "PARTITIONS" ("PART_ID", "CREATE_TIME", "LAST_ACCESS_TIME", "PART_NAME",
"SD_ID", "TBL_ID") FROM stdin;
3 1421943647 0 ddate=NOT_DATE 6 2
4 1421943647 0 ddate=20150121 7 2
5 1421943648 0 ddate=20150122 8 2
6 1421943664 0 ddate=NOT_DATE 9 3
7 1421943664 0 ddate=20150121 10 3
8 1421943665 0 ddate=20150122 11 3
9 1421943694 0 ddate=2015-01-23 12 2
10 1421943695 0 ddate=2015-01-23 13 3
\.
--
COPY "PARTITION_KEY_VALS" ("PART_ID", "PART_KEY_VAL", "INTEGER_IDX") FROM stdin;
3 2015-01-23 0
4 2015-01-23 0
5 2015-01-23 0
6 2015-01-23 0
7 2015-01-23 0
8 2015-01-23 0
9 2015-01-23 0
10 2015-01-23 0
\.
{noformat}
Same corruption from the HIVE CLI:
{noformat}
hive> show partitions HIVE5700_DATE_PARTED;
OK
ddate=20150121
ddate=20150122
ddate=2015-01-23
ddate=NOT_DATE
Time taken: 0.844 seconds, Fetched: 4 row(s)
hive> show partitions HIVE5700_STRING_PARTED;
OK
ddate=20150121
ddate=20150122
ddate=2015-01-23
ddate=NOT_DATE
Time taken: 0.114 seconds, Fetched: 4 row(s)
hive> select * from HIVE5700_STRING_PARTED;
OK
some data! 2015-01-23
some data! 2015-01-23
some data! 2015-01-23
some data! 2015-01-23
Time taken: 0.536 seconds, Fetched: 4 row(s)
hive> select * from HIVE5700_DATE_PARTED;
OK
some data! 2015-01-23
some data! 2015-01-23
some data! 2015-01-23
some data! 2015-01-23
Time taken: 0.145 seconds, Fetched: 4 row(s)
{noformat}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)