This is an automated email from the ASF dual-hosted git repository. boroknagyz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 8292e4afdd4b6f5fcfbf2291f97c988c07e1a421 Author: Tamas Mate <[email protected]> AuthorDate: Thu Jan 26 12:32:32 2023 +0100 IMPALA-11864: Iceberg LOAD DATA should not load S3 hidden files Loading data from S3 did not skip hidden files because the FileSystemUtil.listFiles() call was returning a RemoteIterator, which, unlike RecursingIterator, does not filter out hidden files. This would make a load fail because hidden files likely have an invalid magic string. This commit adds an extra condition to skip hidden files when creating the CREATE subquery. Testing: - Added E2E test - Ran E2E test on S3 build Change-Id: Iffd179383c2bb2529f6f9b5f8bf5cba5f3553652 Reviewed-on: http://gerrit.cloudera.org:8080/19441 Reviewed-by: Daniel Becker <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> Reviewed-by: Noemi Pap-Takacs <[email protected]> Reviewed-by: Zoltan Borok-Nagy <[email protected]> --- .../java/org/apache/impala/analysis/LoadDataStmt.java | 3 ++- .../functional-query/queries/QueryTest/iceberg-load.test | 16 +++++++++++++++- tests/query_test/test_iceberg.py | 8 ++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/fe/src/main/java/org/apache/impala/analysis/LoadDataStmt.java b/fe/src/main/java/org/apache/impala/analysis/LoadDataStmt.java index d23d1951d..479ba9142 100644 --- a/fe/src/main/java/org/apache/impala/analysis/LoadDataStmt.java +++ b/fe/src/main/java/org/apache/impala/analysis/LoadDataStmt.java @@ -255,7 +255,8 @@ public class LoadDataStmt extends StatementBase { sourcePath, true, ""); while (fileStatuses.hasNext()) { FileStatus fileStatus = fileStatuses.next(); - if (fileStatus.isFile()) { + String fileName = fileStatus.getPath().getName(); + if (fileStatus.isFile() && !FileSystemUtil.isHiddenFile(fileName)) { filePathForLike = fileStatus.getPath(); break; } diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-load.test 
b/testdata/workloads/functional-query/queries/QueryTest/iceberg-load.test index c5ad08e6a..14f19d77f 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-load.test +++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-load.test @@ -122,7 +122,6 @@ row_regex:.*AnalysisException: Target table .* has fewer columns \(1\) than the ==== ---- QUERY # Test 9: Partitioned Iceberg table ----- QUERY create table test_iceberg_load_partitioned like functional_parquet.iceberg_partitioned stored as iceberg; ==== @@ -139,4 +138,19 @@ select count(*) from test_iceberg_load_partitioned; 1 ---- TYPES BIGINT +==== +---- QUERY +# Test 10: hidden files should be skipped and the one data file should be loaded +load data inpath '/tmp/$DATABASE/hidden/' into table test_iceberg_load_partitioned; +---- RESULTS +'Loaded 1 file(s).' +---- TYPES +STRING +==== +---- QUERY +select count(*) from test_iceberg_load_partitioned; +---- RESULTS +2 +---- TYPES +BIGINT ==== \ No newline at end of file diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py index 9ae75885f..0ed9a6d8f 100644 --- a/tests/query_test/test_iceberg.py +++ b/tests/query_test/test_iceberg.py @@ -885,6 +885,14 @@ class TestIcebergTable(IcebergTestSuite): DST_DIR = "/tmp/" + unique_database + "/partitioned/" self.filesystem_client.make_dir(DST_DIR, permission=777) self.filesystem_client.copy_from_local(SRC_DIR.format(file), DST_DIR) + # Test 10 init: hidden files + DST_DIR = "/tmp/" + unique_database + "/hidden/" + self.filesystem_client.make_dir(DST_DIR, permission=777) + self.filesystem_client.create_file(DST_DIR + "_hidden.1", "Test data 123") + self.filesystem_client.create_file(DST_DIR + "_hidden_2.1", "Test data 123") + self.filesystem_client.create_file(DST_DIR + ".hidden_3", "Test data 123") + self.filesystem_client.create_file(DST_DIR + ".hidden_4.1", "Test data 123") + self.filesystem_client.copy_from_local(SRC_DIR.format(file), DST_DIR) # Init test table 
create_iceberg_table_from_directory(self.client, unique_database,
