IMPALA-3459: Add test for DROP TABLE PURGE for S3

It was previously thought that PURGE had no effect on S3. However, the
Hive Metastore actually creates a .Trash directory and copies the
files there when a plain DROP TABLE is issued from Impala.
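As a rough illustration (the database name below is hypothetical; the
tests build it from the DDL_PURGE_DB constant), the trash copy mirrors
the original warehouse path under the current user's trash root:

    import getpass

    # Sketch only: where a non-PURGE "drop table" leaves the data.
    original = "test-warehouse/ddl_purge_db.db/t1/t1.txt"  # hypothetical db name
    trashed = "user/{0}/.Trash/Current/{1}".format(getpass.getuser(), original)
    # e.g. user/jenkins/.Trash/Current/test-warehouse/ddl_purge_db.db/t1/t1.txt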
This patch enables the existing PURGE tests for S3. There were a few
reasons this wasn't working before. First, the paths given to the S3
client (boto3) must not have a leading "/"; the leading "/" has been
removed, which makes no difference to HDFS either way. Second, PURGE
is a pure delete, whereas a regular DROP is a copy into the trash. On
S3 a copy is consistent but a delete is only eventually consistent, so
when we PURGE a table or partition the files may still be visible for
some time after the query has completed. The tests have been modified
to accommodate this case as well.
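To illustrate the leading-slash problem, here is a minimal boto3
sketch (the bucket name and key are hypothetical, not taken from the
test suite). Unlike HDFS paths, S3 keys are plain strings with no
notion of a filesystem root, so a leading "/" silently names a
different object:

    import boto3
    from botocore.exceptions import ClientError

    s3 = boto3.client('s3')
    bucket = 'impala-test-bucket'  # hypothetical bucket name

    # This is the key the file was actually written under; no leading "/".
    s3.head_object(Bucket=bucket, Key='test-warehouse/t1.txt')

    try:
      # A leading "/" names a *different* key, so the lookup 404s even
      # though the "same" path works fine against HDFS.
      s3.head_object(Bucket=bucket, Key='/test-warehouse/t1.txt')
    except ClientError:
      pass  # 404: no such key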
Change-Id: I52d2451e090b00ae2fd9a879c28defa6c940047c
Reviewed-on: http://gerrit.cloudera.org:8080/3036
Reviewed-by: Sailesh Mukil <[email protected]>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/7e0cbaf1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/7e0cbaf1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/7e0cbaf1

Branch: refs/heads/master
Commit: 7e0cbaf1a06da075639b36290b1ec09ef82122e0
Parents: 6910f49
Author: Sailesh Mukil <[email protected]>
Authored: Wed May 11 18:22:17 2016 -0700
Committer: Tim Armstrong <[email protected]>
Committed: Thu May 12 23:06:36 2016 -0700

----------------------------------------------------------------------
 tests/common/skip.py       |  2 -
 tests/metadata/test_ddl.py | 94 +++++++++++++++++++++++------------------
 2 files changed, 52 insertions(+), 44 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/7e0cbaf1/tests/common/skip.py
----------------------------------------------------------------------
diff --git a/tests/common/skip.py b/tests/common/skip.py
index 3c4fe27..b2f52ba 100644
--- a/tests/common/skip.py
+++ b/tests/common/skip.py
@@ -34,8 +34,6 @@ class SkipIfS3:
   jira = partial(pytest.mark.skipif, IS_S3)
   hdfs_encryption = pytest.mark.skipif(IS_S3,
       reason="HDFS encryption is not supported with S3")
-  hdfs_purge = pytest.mark.skipif(IS_S3,
-      reason="PURGE has no effect on S3")
 
   # These ones need test infra work to re-enable.
   udfs = pytest.mark.skipif(IS_S3, reason="udas/udfs not copied to S3")

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/7e0cbaf1/tests/metadata/test_ddl.py
----------------------------------------------------------------------
diff --git a/tests/metadata/test_ddl.py b/tests/metadata/test_ddl.py
index 791d68d..0a1900c 100644
--- a/tests/metadata/test_ddl.py
+++ b/tests/metadata/test_ddl.py
@@ -68,7 +68,6 @@ class TestDdlStatements(ImpalaTestSuite):
     for dir_ in ['part_data', 't1_tmp1', 't_part_tmp']:
       self.filesystem_client.delete_file_dir('test-warehouse/%s' % dir_, recursive=True)
 
-  @SkipIfS3.hdfs_purge
   @SkipIfLocal.hdfs_client
   @pytest.mark.execute_serially
   def test_drop_table_with_purge(self):
@@ -80,41 +79,48 @@ class TestDdlStatements(ImpalaTestSuite):
     self.client.execute("create table {0}.t1(i int)".format(DDL_PURGE_DB))
     self.client.execute("create table {0}.t2(i int)".format(DDL_PURGE_DB))
     # Create sample test data files under the table directories
-    self.hdfs_client.create_file("test-warehouse/{0}.db/t1/t1.txt".format(DDL_PURGE_DB),\
-        file_data='t1')
-    self.hdfs_client.create_file("test-warehouse/{0}.db/t2/t2.txt".format(DDL_PURGE_DB),\
-        file_data='t2')
+    self.filesystem_client.create_file("test-warehouse/{0}.db/t1/t1.txt".\
+        format(DDL_PURGE_DB), file_data='t1')
+    self.filesystem_client.create_file("test-warehouse/{0}.db/t2/t2.txt".\
+        format(DDL_PURGE_DB), file_data='t2')
     # Drop the table (without purge) and make sure it exists in trash
     self.client.execute("drop table {0}.t1".format(DDL_PURGE_DB))
-    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/t1.txt".\
+    assert not self.filesystem_client.exists("test-warehouse/{0}.db/t1/t1.txt".\
         format(DDL_PURGE_DB))
-    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/".format(DDL_PURGE_DB))
-    assert self.hdfs_client.exists(\
-        "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/t1.txt".\
+    assert not self.filesystem_client.exists("test-warehouse/{0}.db/t1/".\
+        format(DDL_PURGE_DB))
+    assert self.filesystem_client.exists(\
+        "user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/t1.txt".\
         format(getpass.getuser(), DDL_PURGE_DB))
-    assert self.hdfs_client.exists(\
-        "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1".\
+    assert self.filesystem_client.exists(\
+        "user/{0}/.Trash/Current/test-warehouse/{1}.db/t1".\
         format(getpass.getuser(), DDL_PURGE_DB))
     # Drop the table (with purge) and make sure it doesn't exist in trash
     self.client.execute("drop table {0}.t2 purge".format(DDL_PURGE_DB))
-    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t2/".format(DDL_PURGE_DB))
-    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t2/t2.txt".\
-        format(DDL_PURGE_DB))
-    assert not self.hdfs_client.exists(\
-        "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t2/t2.txt".\
+    if not IS_S3:
+      # In S3, deletes are eventual. So even though we dropped the table, the files
+      # belonging to this table will still be visible for some unbounded time. This
+      # happens only with PURGE. A regular DROP TABLE is just a copy of files which is
+      # consistent.
+      assert not self.filesystem_client.exists("test-warehouse/{0}.db/t2/".\
+          format(DDL_PURGE_DB))
+      assert not self.filesystem_client.exists("test-warehouse/{0}.db/t2/t2.txt".\
+          format(DDL_PURGE_DB))
+      assert not self.filesystem_client.exists(\
+          "user/{0}/.Trash/Current/test-warehouse/{1}.db/t2/t2.txt".\
         format(getpass.getuser(), DDL_PURGE_DB))
-    assert not self.hdfs_client.exists(\
-        "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t2".\
+      assert not self.filesystem_client.exists(\
+          "user/{0}/.Trash/Current/test-warehouse/{1}.db/t2".\
         format(getpass.getuser(), DDL_PURGE_DB))
     # Create an external table t3 and run the same test as above. Make
     # sure the data is not deleted
-    self.hdfs_client.make_dir("test-warehouse/data_t3/", permission=777)
-    self.hdfs_client.create_file("test-warehouse/data_t3/data.txt", file_data='100')
+    self.filesystem_client.make_dir("test-warehouse/data_t3/", permission=777)
+    self.filesystem_client.create_file("test-warehouse/data_t3/data.txt", file_data='100')
     self.client.execute("create external table {0}.t3(i int) stored as \
         textfile location \'/test-warehouse/data_t3\'" .format(DDL_PURGE_DB))
     self.client.execute("drop table {0}.t3 purge".format(DDL_PURGE_DB))
-    assert self.hdfs_client.exists("test-warehouse/data_t3/data.txt")
-    self.hdfs_client.delete_file_dir("test-warehouse/data_t3", recursive=True)
+    assert self.filesystem_client.exists("test-warehouse/data_t3/data.txt")
+    self.filesystem_client.delete_file_dir("test-warehouse/data_t3", recursive=True)
 
   @SkipIfLocal.hdfs_client
   @pytest.mark.execute_serially
@@ -306,7 +312,6 @@ class TestDdlStatements(ImpalaTestSuite):
     self.run_test_case('QueryTest/alter-table', vector, use_db='alter_table_test_db',
         multiple_impalad=self._use_multiple_impalad(vector))
 
-  @SkipIfS3.hdfs_purge # S3: missing coverage: alter table drop partition
   @SkipIfLocal.hdfs_client
   @pytest.mark.execute_serially
   def test_drop_partition_with_purge(self, vector):
@@ -315,38 +320,43 @@ class TestDdlStatements(ImpalaTestSuite):
     # Create a sample database alter_purge_db and table t1 in it
     self._create_db(ALTER_PURGE_DB)
     self.client.execute("create table {0}.t1(i int) partitioned\
-        by (j int)".format(ALTER_PURGE_DB))
+    by (j int)".format(ALTER_PURGE_DB))
     # Add two partitions (j=1) and (j=2) to table t1
     self.client.execute("alter table {0}.t1 add partition(j=1)".format(ALTER_PURGE_DB))
     self.client.execute("alter table {0}.t1 add partition(j=2)".format(ALTER_PURGE_DB))
-    self.hdfs_client.create_file(\
-        "test-warehouse/{0}.db/t1/j=1/j1.txt".format(ALTER_PURGE_DB), file_data='j1')
-    self.hdfs_client.create_file(\
-        "test-warehouse/{0}.db/t1/j=2/j2.txt".format(ALTER_PURGE_DB), file_data='j2')
+    self.filesystem_client.create_file(\
+        "test-warehouse/{0}.db/t1/j=1/j1.txt".format(ALTER_PURGE_DB), file_data='j1')
+    self.filesystem_client.create_file(\
+        "test-warehouse/{0}.db/t1/j=2/j2.txt".format(ALTER_PURGE_DB), file_data='j2')
     # Drop the partition (j=1) without purge and make sure it exists in trash
     self.client.execute("alter table {0}.t1 drop partition(j=1)".format(ALTER_PURGE_DB));
-    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=1/j1.txt".\
+    assert not self.filesystem_client.exists("test-warehouse/{0}.db/t1/j=1/j1.txt".\
         format(ALTER_PURGE_DB))
-    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=1".\
+    assert not self.filesystem_client.exists("test-warehouse/{0}.db/t1/j=1".\
        format(ALTER_PURGE_DB))
-    assert self.hdfs_client.exists(\
-        "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=1/j1.txt".\
+    assert self.filesystem_client.exists(\
+        "user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=1/j1.txt".\
        format(getpass.getuser(), ALTER_PURGE_DB))
-    assert self.hdfs_client.exists(\
-        "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=1".\
+    assert self.filesystem_client.exists(\
+        "user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=1".\
        format(getpass.getuser(), ALTER_PURGE_DB))
     # Drop the partition (with purge) and make sure it doesn't exist in trash
     self.client.execute("alter table {0}.t1 drop partition(j=2) purge".\
        format(ALTER_PURGE_DB));
-    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=2/j2.txt".\
-        format(ALTER_PURGE_DB))
-    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=2".\
-        format(ALTER_PURGE_DB))
-    assert not self.hdfs_client.exists(\
-        "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=2".\
+    if not IS_S3:
+      # In S3, deletes are eventual. So even though we dropped the partition, the files
+      # belonging to this partition will still be visible for some unbounded time. This
+      # happens only with PURGE. A regular DROP TABLE is just a copy of files which is
+      # consistent.
+      assert not self.filesystem_client.exists("test-warehouse/{0}.db/t1/j=2/j2.txt".\
+          format(ALTER_PURGE_DB))
+      assert not self.filesystem_client.exists("test-warehouse/{0}.db/t1/j=2".\
+          format(ALTER_PURGE_DB))
+      assert not self.filesystem_client.exists(\
+          "user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=2".\
        format(getpass.getuser(), ALTER_PURGE_DB))
-    assert not self.hdfs_client.exists(
-        "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=2/j2.txt".\
+      assert not self.filesystem_client.exists(
+          "user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=2/j2.txt".\
        format(getpass.getuser(), ALTER_PURGE_DB))
 
   @pytest.mark.execute_serially
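The patch handles S3's eventually consistent deletes by simply
skipping the post-PURGE assertions under "if not IS_S3". An
alternative, shown here only as a hypothetical sketch (wait_for_delete
is not part of this patch), would be to poll until the delete becomes
visible:

    import time

    def wait_for_delete(filesystem_client, path, timeout_secs=60):
      # Hypothetical helper: poll until a purged path stops being visible,
      # since S3 deletes are only eventually consistent. Returns True if the
      # path disappeared within the timeout, False otherwise.
      deadline = time.time() + timeout_secs
      while time.time() < deadline:
        if not filesystem_client.exists(path):
          return True
        time.sleep(1)
      return False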
