This is an automated email from the ASF dual-hosted git repository.

ayushsaxena pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new 8e9cd066129 HIVE-27926: Iceberg: Allow restricting Iceberg data file 
reads to table location. (#4910). (Ayush Saxena, reviewed by Denys Kuzmenko)
8e9cd066129 is described below

commit 8e9cd066129d65ce174c5533132eb73fb25f41d5
Author: Ayush Saxena <[email protected]>
AuthorDate: Tue Dec 5 10:43:53 2023 +0530

    HIVE-27926: Iceberg: Allow restricting Iceberg data file reads to table 
location. (#4910). (Ayush Saxena, reviewed by Denys Kuzmenko)
---
 .../java/org/apache/hadoop/hive/conf/HiveConf.java |  5 +-
 .../iceberg/mr/hive/HiveIcebergStorageHandler.java |  7 ++
 .../iceberg/mr/mapreduce/IcebergInputFormat.java   | 20 ++++++
 .../mr/hive/TestHiveIcebergRestrictDataFiles.java  | 74 ++++++++++++++++++++++
 .../org/apache/hive/jdbc/TestRestrictedList.java   |  1 +
 5 files changed, 106 insertions(+), 1 deletion(-)

diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java 
b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index ad807386f36..991c97e250a 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2229,6 +2229,8 @@ public class HiveConf extends Configuration {
 
     HIVE_ICEBERG_MASK_DEFAULT_LOCATION("hive.iceberg.mask.default.location", 
false,
         "If this is set to true the URI for auth will have the default 
location masked with DEFAULT_TABLE_LOCATION"),
+    
HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY("hive.iceberg.allow.datafiles.in.table.location.only",
 false,
+        "If this is set to true, then all the data files being read should be 
within the table location"),
 
     HIVEUSEEXPLICITRCFILEHEADER("hive.exec.rcfile.use.explicit.header", true,
         "If this is set the header for RCFiles will simply be RCF.  If this is 
not\n" +
@@ -5575,7 +5577,8 @@ public class HiveConf extends Configuration {
             "hive.zookeeper.ssl.keystore.type," +
             "hive.zookeeper.ssl.truststore.location," +
             "hive.zookeeper.ssl.truststore.password," +
-            "hive.zookeeper.ssl.truststore.type",
+            "hive.zookeeper.ssl.truststore.type," +
+            "hive.iceberg.allow.datafiles.in.table.location.only",
         "Comma separated list of configuration options which are immutable at 
runtime"),
     HIVE_CONF_HIDDEN_LIST("hive.conf.hidden.list",
         METASTOREPWD.varname + "," + HIVE_SERVER2_SSL_KEYSTORE_PASSWORD.varname
diff --git 
a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
 
b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
index 5f4f97b9f72..e3336437c77 100644
--- 
a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
+++ 
b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
@@ -278,6 +278,13 @@ public class HiveIcebergStorageHandler implements 
HiveStoragePredicateHandler, H
     overlayTableProperties(conf, tableDesc, map);
     // Until the vectorized reader can handle delete files, let's fall back to 
non-vector mode for V2 tables
     fallbackToNonVectorizedModeBasedOnProperties(tableDesc.getProperties());
+
+    boolean allowDataFilesWithinTableLocationOnly =
+        
conf.getBoolean(HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.varname,
+            
HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.defaultBoolVal);
+
+    
map.put(HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.varname,
+        String.valueOf(allowDataFilesWithinTableLocationOnly));
   }
 
   @Override
diff --git 
a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
 
b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
index dc50a1e3401..3ec1a3b3b7a 100644
--- 
a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
+++ 
b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
@@ -33,8 +33,11 @@ import java.util.stream.Stream;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.FileUtils;
+import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.llap.LlapHiveUtils;
 import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.metadata.AuthorizationException;
 import org.apache.hadoop.hive.ql.metadata.HiveUtils;
 import org.apache.hadoop.hive.ql.plan.MapWork;
 import org.apache.hadoop.mapred.JobConf;
@@ -218,6 +221,12 @@ public class IcebergInputFormat<T> extends 
InputFormat<Void, T> {
       scan = applyConfig(conf, createTableScan(table, conf));
     }
 
+    boolean allowDataFilesWithinTableLocationOnly =
+        
conf.getBoolean(HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.varname,
+            
HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.defaultBoolVal);
+    Path tableLocation = new Path(conf.get(InputFormatConfig.TABLE_LOCATION));
+
+
     try (CloseableIterable<CombinedScanTask> tasksIterable = scan.planTasks()) 
{
       tasksIterable.forEach(task -> {
         if (applyResidual && (model == 
InputFormatConfig.InMemoryDataModel.HIVE ||
@@ -225,6 +234,9 @@ public class IcebergInputFormat<T> extends 
InputFormat<Void, T> {
           // TODO: We do not support residual evaluation for HIVE and PIG in 
memory data model yet
           checkResiduals(task);
         }
+        if (allowDataFilesWithinTableLocationOnly) {
+          validateFileLocations(task, tableLocation);
+        }
         splits.add(new IcebergSplit(conf, task));
       });
     } catch (IOException e) {
@@ -241,6 +253,14 @@ public class IcebergInputFormat<T> extends 
InputFormat<Void, T> {
     return splits;
   }
 
+  private static void validateFileLocations(CombinedScanTask split, Path 
tableLocation) {
+    for (FileScanTask fileScanTask : split.files()) {
+      if (!FileUtils.isPathWithinSubtree(new 
Path(fileScanTask.file().path().toString()), tableLocation)) {
+        throw new AuthorizationException("The table contains paths which are 
outside the table location");
+      }
+    }
+  }
+
   private static void checkResiduals(CombinedScanTask task) {
     task.files().forEach(fileScanTask -> {
       Expression residual = fileScanTask.residual();
diff --git 
a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergRestrictDataFiles.java
 
b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergRestrictDataFiles.java
new file mode 100644
index 00000000000..e9d6950ef46
--- /dev/null
+++ 
b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergRestrictDataFiles.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.mr.hive;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+import org.apache.commons.collections4.ListUtils;
+import org.apache.iceberg.AssertHelpers;
+import org.apache.iceberg.catalog.TableIdentifier;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import static 
org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY;
+
+public class TestHiveIcebergRestrictDataFiles extends 
HiveIcebergStorageHandlerWithEngineBase {
+
+  @BeforeClass
+  public static void beforeClass() {
+    shell = HiveIcebergStorageHandlerTestUtils.shell(
+        
Collections.singletonMap(HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY.varname,
 "true"));
+  }
+
+  @Test
+  public void testRestrictDataFiles() throws IOException, InterruptedException 
{
+    TableIdentifier table1 = TableIdentifier.of("default", "tab1");
+    testTables.createTableWithVersions(shell, table1.name(), 
HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
+        fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);
+
+    AssertHelpers.assertThrows("Should throw exception since there are files 
outside the table directory",
+        IllegalArgumentException.class, "The table contains paths which are 
outside the table location",
+        () -> shell.executeStatement("SELECT * FROM " + table1.name()));
+
+    // Create another table with files within the table location
+    TableIdentifier table2 = TableIdentifier.of("default", "tab2");
+    testTables.createTableWithVersions(shell, table2.name(), 
HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
+        fileFormat, null, 0);
+
+    shell.executeStatement(
+        
testTables.getInsertQuery(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 
table2, false));
+
+    List<Object[]> result = shell.executeStatement("SELECT * FROM " + 
table2.name());
+
+    
HiveIcebergTestUtils.validateData(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS,
+        
HiveIcebergTestUtils.valueForRow(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
 result), 0);
+
+    // Insert some more records to generate new Data file
+    shell.executeStatement(
+        
testTables.getInsertQuery(HiveIcebergStorageHandlerTestUtils.OTHER_CUSTOMER_RECORDS_1,
 table2, false));
+
+    result = shell.executeStatement("SELECT * FROM " + table2.name());
+
+    
HiveIcebergTestUtils.validateData(ListUtils.union(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS,
+            HiveIcebergStorageHandlerTestUtils.OTHER_CUSTOMER_RECORDS_1),
+        
HiveIcebergTestUtils.valueForRow(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
 result), 0);
+  }
+}
diff --git 
a/itests/hive-unit/src/test/java/org/apache/hive/jdbc/TestRestrictedList.java 
b/itests/hive-unit/src/test/java/org/apache/hive/jdbc/TestRestrictedList.java
index aeec57757c2..52be546dca8 100644
--- 
a/itests/hive-unit/src/test/java/org/apache/hive/jdbc/TestRestrictedList.java
+++ 
b/itests/hive-unit/src/test/java/org/apache/hive/jdbc/TestRestrictedList.java
@@ -109,6 +109,7 @@ public class TestRestrictedList {
     addToExpectedRestrictedMap("hive.zookeeper.ssl.truststore.location");
     addToExpectedRestrictedMap("hive.zookeeper.ssl.truststore.password");
     addToExpectedRestrictedMap("hive.zookeeper.ssl.truststore.type");
+    
addToExpectedRestrictedMap("hive.iceberg.allow.datafiles.in.table.location.only");
 
     checkRestrictedListMatch();
   }

Reply via email to