This is an automated email from the ASF dual-hosted git repository.
ayushsaxena pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new 8e9cd066129 HIVE-27926: Iceberg: Allow restricting Iceberg data file reads to table location. (#4910). (Ayush Saxena, reviewed by Denys Kuzmenko)
8e9cd066129 is described below
commit 8e9cd066129d65ce174c5533132eb73fb25f41d5
Author: Ayush Saxena <[email protected]>
AuthorDate: Tue Dec 5 10:43:53 2023 +0530
    HIVE-27926: Iceberg: Allow restricting Iceberg data file reads to table location. (#4910). (Ayush Saxena, reviewed by Denys Kuzmenko)
---
.../java/org/apache/hadoop/hive/conf/HiveConf.java | 5 +-
.../iceberg/mr/hive/HiveIcebergStorageHandler.java | 7 ++
.../iceberg/mr/mapreduce/IcebergInputFormat.java | 20 ++++++
.../mr/hive/TestHiveIcebergRestrictDataFiles.java | 74 ++++++++++++++++++++++
.../org/apache/hive/jdbc/TestRestrictedList.java | 1 +
5 files changed, 106 insertions(+), 1 deletion(-)
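
For context before the diff: the patch introduces a single boolean flag, hive.iceberg.allow.datafiles.in.table.location.only, defaulting to false. Below is a minimal sketch of reading the flag through HiveConf once it is set in hive-site.xml; the demo class and main() wrapper are illustrative only and not part of the patch.

    import org.apache.hadoop.hive.conf.HiveConf;

    public class RestrictDataFilesFlagDemo {
      public static void main(String[] args) {
        // Picks up hive-site.xml from the classpath; the flag defaults to false.
        HiveConf conf = new HiveConf();
        boolean restrict =
            conf.getBoolVar(HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY);
        System.out.println("Iceberg data files restricted to table location: " + restrict);
      }
    }
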
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index ad807386f36..991c97e250a 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2229,6 +2229,8 @@ public class HiveConf extends Configuration {
     HIVE_ICEBERG_MASK_DEFAULT_LOCATION("hive.iceberg.mask.default.location", false,
         "If this is set to true the URI for auth will have the default location masked with DEFAULT_TABLE_LOCATION"),
+    HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY("hive.iceberg.allow.datafiles.in.table.location.only", false,
+        "If this is set to true, then all the data files being read should be within the table location"),
     HIVEUSEEXPLICITRCFILEHEADER("hive.exec.rcfile.use.explicit.header", true,
         "If this is set the header for RCFiles will simply be RCF. If this is not\n" +
@@ -5575,7 +5577,8 @@ public class HiveConf extends Configuration {
"hive.zookeeper.ssl.keystore.type," +
"hive.zookeeper.ssl.truststore.location," +
"hive.zookeeper.ssl.truststore.password," +
- "hive.zookeeper.ssl.truststore.type",
+ "hive.zookeeper.ssl.truststore.type," +
+ "hive.iceberg.allow.datafiles.in.table.location.only",
"Comma separated list of configuration options which are immutable at
runtime"),
HIVE_CONF_HIDDEN_LIST("hive.conf.hidden.list",
METASTOREPWD.varname + "," + HIVE_SERVER2_SSL_KEYSTORE_PASSWORD.varname
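
Because the flag is appended to hive.conf.restricted.list, it cannot be overridden per session. A rough sketch of that behaviour, assuming this patch is applied: HiveConf.verifyAndSet is the same check applied to runtime SET commands, while the demo class itself is illustrative.

    import org.apache.hadoop.hive.conf.HiveConf;

    public class RestrictedListDemo {
      public static void main(String[] args) {
        HiveConf conf = new HiveConf();
        try {
          // Rejected because the property is on the restricted list.
          conf.verifyAndSet("hive.iceberg.allow.datafiles.in.table.location.only", "false");
        } catch (IllegalArgumentException e) {
          System.out.println("Rejected as expected: " + e.getMessage());
        }
      }
    }
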
diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
index 5f4f97b9f72..e3336437c77 100644
--- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
@@ -278,6 +278,13 @@ public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, H
     overlayTableProperties(conf, tableDesc, map);
     // Until the vectorized reader can handle delete files, let's fall back to non-vector mode for V2 tables
     fallbackToNonVectorizedModeBasedOnProperties(tableDesc.getProperties());
+
+    boolean allowDataFilesWithinTableLocationOnly =
+        conf.getBoolean(HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.varname,
+            HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.defaultBoolVal);
+
+    map.put(HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.varname,
+        String.valueOf(allowDataFilesWithinTableLocationOnly));
   }
 
   @Override
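
The handler copies the flag into the job properties map so that it reaches the input format through the job Configuration. A rough sketch of that hand-off, using the constant added by this patch; the standalone demo class is illustrative only.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hive.conf.HiveConf;

    public class FlagPropagationDemo {
      public static void main(String[] args) {
        Configuration jobConf = new Configuration();
        // What configureInputJobProperties effectively contributes via the map:
        jobConf.set(HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.varname, "true");

        // What IcebergInputFormat reads back while planning splits:
        boolean restrict = jobConf.getBoolean(
            HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.varname,
            HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.defaultBoolVal);
        System.out.println(restrict); // true
      }
    }
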
diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
index dc50a1e3401..3ec1a3b3b7a 100644
--- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
@@ -33,8 +33,11 @@ import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.FileUtils;
+import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.llap.LlapHiveUtils;
import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.metadata.AuthorizationException;
import org.apache.hadoop.hive.ql.metadata.HiveUtils;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.mapred.JobConf;
@@ -218,6 +221,12 @@ public class IcebergInputFormat<T> extends InputFormat<Void, T> {
       scan = applyConfig(conf, createTableScan(table, conf));
     }
+    boolean allowDataFilesWithinTableLocationOnly =
+        conf.getBoolean(HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.varname,
+            HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.defaultBoolVal);
+    Path tableLocation = new Path(conf.get(InputFormatConfig.TABLE_LOCATION));
+
     try (CloseableIterable<CombinedScanTask> tasksIterable = scan.planTasks()) {
       tasksIterable.forEach(task -> {
         if (applyResidual && (model == InputFormatConfig.InMemoryDataModel.HIVE ||
@@ -225,6 +234,9 @@ public class IcebergInputFormat<T> extends InputFormat<Void, T> {
           // TODO: We do not support residual evaluation for HIVE and PIG in memory data model yet
           checkResiduals(task);
         }
+        if (allowDataFilesWithinTableLocationOnly) {
+          validateFileLocations(task, tableLocation);
+        }
         splits.add(new IcebergSplit(conf, task));
       });
     } catch (IOException e) {
@@ -241,6 +253,14 @@ public class IcebergInputFormat<T> extends InputFormat<Void, T> {
     return splits;
   }
 
+  private static void validateFileLocations(CombinedScanTask split, Path tableLocation) {
+    for (FileScanTask fileScanTask : split.files()) {
+      if (!FileUtils.isPathWithinSubtree(new Path(fileScanTask.file().path().toString()), tableLocation)) {
+        throw new AuthorizationException("The table contains paths which are outside the table location");
+      }
+    }
+  }
+
   private static void checkResiduals(CombinedScanTask task) {
     task.files().forEach(fileScanTask -> {
       Expression residual = fileScanTask.residual();
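
validateFileLocations leans on FileUtils.isPathWithinSubtree from hive-common to decide whether each scanned data file sits under the table location. A small sketch of that check in isolation; the paths below are made up for illustration.

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.common.FileUtils;

    public class SubtreeCheckDemo {
      public static void main(String[] args) {
        Path tableLocation = new Path("hdfs://nn:8020/warehouse/external/hive/tab2");

        // A file under the table directory passes the check.
        System.out.println(FileUtils.isPathWithinSubtree(
            new Path("hdfs://nn:8020/warehouse/external/hive/tab2/data/00000-0.orc"), tableLocation));

        // A file outside the table directory fails and triggers the AuthorizationException above.
        System.out.println(FileUtils.isPathWithinSubtree(
            new Path("hdfs://nn:8020/warehouse/external/hive/tab1/data/00000-0.orc"), tableLocation));
      }
    }
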
diff --git a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergRestrictDataFiles.java b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergRestrictDataFiles.java
new file mode 100644
index 00000000000..e9d6950ef46
--- /dev/null
+++ b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergRestrictDataFiles.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.mr.hive;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+import org.apache.commons.collections4.ListUtils;
+import org.apache.iceberg.AssertHelpers;
+import org.apache.iceberg.catalog.TableIdentifier;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY;
+
+public class TestHiveIcebergRestrictDataFiles extends HiveIcebergStorageHandlerWithEngineBase {
+
+  @BeforeClass
+  public static void beforeClass() {
+    shell = HiveIcebergStorageHandlerTestUtils.shell(
+        Collections.singletonMap(HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.varname, "true"));
+  }
+
+  @Test
+  public void testRestrictDataFiles() throws IOException, InterruptedException {
+    TableIdentifier table1 = TableIdentifier.of("default", "tab1");
+    testTables.createTableWithVersions(shell, table1.name(), HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
+        fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);
+
+    AssertHelpers.assertThrows("Should throw exception since there are files outside the table directory",
+        IllegalArgumentException.class, "The table contains paths which are outside the table location",
+        () -> shell.executeStatement("SELECT * FROM " + table1.name()));
+
+    // Create another table with files within the table location
+    TableIdentifier table2 = TableIdentifier.of("default", "tab2");
+    testTables.createTableWithVersions(shell, table2.name(), HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
+        fileFormat, null, 0);
+
+    shell.executeStatement(
+        testTables.getInsertQuery(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, table2, false));
+
+    List<Object[]> result = shell.executeStatement("SELECT * FROM " + table2.name());
+
+    HiveIcebergTestUtils.validateData(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS,
+        HiveIcebergTestUtils.valueForRow(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, result), 0);
+
+    // Insert some more records to generate new Data file
+    shell.executeStatement(
+        testTables.getInsertQuery(HiveIcebergStorageHandlerTestUtils.OTHER_CUSTOMER_RECORDS_1, table2, false));
+
+    result = shell.executeStatement("SELECT * FROM " + table2.name());
+
+    HiveIcebergTestUtils.validateData(ListUtils.union(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS,
+        HiveIcebergStorageHandlerTestUtils.OTHER_CUSTOMER_RECORDS_1),
+        HiveIcebergTestUtils.valueForRow(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, result), 0);
+  }
+}
diff --git a/itests/hive-unit/src/test/java/org/apache/hive/jdbc/TestRestrictedList.java b/itests/hive-unit/src/test/java/org/apache/hive/jdbc/TestRestrictedList.java
index aeec57757c2..52be546dca8 100644
--- a/itests/hive-unit/src/test/java/org/apache/hive/jdbc/TestRestrictedList.java
+++ b/itests/hive-unit/src/test/java/org/apache/hive/jdbc/TestRestrictedList.java
@@ -109,6 +109,7 @@ public class TestRestrictedList {
addToExpectedRestrictedMap("hive.zookeeper.ssl.truststore.location");
addToExpectedRestrictedMap("hive.zookeeper.ssl.truststore.password");
addToExpectedRestrictedMap("hive.zookeeper.ssl.truststore.type");
+    addToExpectedRestrictedMap("hive.iceberg.allow.datafiles.in.table.location.only");
checkRestrictedListMatch();
}