This is an automated email from the ASF dual-hosted git repository.

suvasude pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-gobblin.git


The following commit(s) were added to refs/heads/master by this push:
     new ca940de  [GOBBLIN-893] Make format-check in ORC-registration optional 
and by-default disabled
ca940de is described below

commit ca940de3e659bf77f7bbeeb2fa4b296077441cae
Author: autumnust <[email protected]>
AuthorDate: Wed Oct 2 14:00:40 2019 -0700

    [GOBBLIN-893] Make format-check in ORC-registration optional and by-default 
disabled
    
    Closes #2748 from autumnust/disableFormatCheck
---
 .../java/org/apache/gobblin/hive/orc/HiveOrcSerDeManager.java | 11 ++++++++++-
 .../org/apache/gobblin/hive/orc/HiveOrcSerDeManagerTest.java  |  4 ++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git 
a/gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManager.java
 
b/gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManager.java
index 436420c..7fdceba 100644
--- 
a/gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManager.java
+++ 
b/gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManager.java
@@ -83,6 +83,9 @@ public class HiveOrcSerDeManager extends HiveSerDeManager {
 
   public static final String HIVE_SPEC_SCHEMA_READING_TIMER = 
"hiveOrcSerdeManager.schemaReadTimer";
 
+  public static final String ENABLED_ORC_TYPE_CHECK = 
"hiveOrcSerdeManager.enableFormatCheck";
+  public static final boolean DEFAULT_ENABLED_ORC_TYPE_CHECK = false;
+
   private static final int EXPECTED_FOOTER_SIZE = 16 * 1024;
   private static final String ORC_FORMAT = "ORC";
   private static final ByteBuffer MAGIC_BUFFER = 
ByteBuffer.wrap(ORC_FORMAT.getBytes(Charsets.UTF_8));
@@ -91,6 +94,7 @@ public class HiveOrcSerDeManager extends HiveSerDeManager {
   private final HiveSerDeWrapper serDeWrapper;
   private final List<String> fileExtensions;
   private final List<String> ignoredFilePrefixes;
+  private final boolean checkOrcFormat;
   private final MetricContext metricContext;
 
   public HiveOrcSerDeManager(State props)
@@ -102,6 +106,7 @@ public class HiveOrcSerDeManager extends HiveSerDeManager {
     this.fileExtensions = extensions.isEmpty() ? ImmutableList.of("") : 
extensions;
 
     this.ignoredFilePrefixes = props.getPropAsList(IGNORED_FILE_PREFIXES_KEY, 
DEFAULT_IGNORED_FILE_PREFIXES);
+    this.checkOrcFormat = props.getPropAsBoolean(ENABLED_ORC_TYPE_CHECK, 
DEFAULT_ENABLED_ORC_TYPE_CHECK);
     this.metricContext = Instrumented.getMetricContext(props, 
HiveOrcSerDeManager.class);
     this.serDeWrapper = HiveSerDeWrapper.get(props.getProp(SERDE_TYPE_KEY, 
DEFAULT_SERDE_TYPE),
         Optional.of(props.getProp(INPUT_FORMAT_CLASS_KEY, 
DEFAULT_INPUT_FORMAT_CLASS)),
@@ -177,7 +182,7 @@ public class HiveOrcSerDeManager extends HiveSerDeManager {
           try {
             return ignoredFilePrefixes.stream().noneMatch(e -> 
path.getName().startsWith(e))
                 && fileExtensions.stream().anyMatch(e -> 
path.getName().endsWith(e))
-                && isORC(path, fs);
+                && (!checkOrcFormat || isORC(path, fs));
           } catch(IOException e) {
             log.error("Error checking file for schema retrieval", e);
             return false;
@@ -199,6 +204,10 @@ public class HiveOrcSerDeManager extends HiveSerDeManager {
   /**
    * Determine if a file is ORC format.
    * Steal ideas & code from presto/OrcReader under Apache License 2.0.
+   *
+   * Note: This operation is pretty expensive when it comes to checking 
magicBytes for each file while listing,
+   * as it requires getFileStatus and opening the file.  In normal cases, 
consider disabling it if the confidence level
+   * of format consistency is high enough.
    */
   private static boolean isORC(Path file, FileSystem fs)
       throws IOException {
diff --git 
a/gobblin-hive-registration/src/test/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManagerTest.java
 
b/gobblin-hive-registration/src/test/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManagerTest.java
index 4ffa738..782469e 100644
--- 
a/gobblin-hive-registration/src/test/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManagerTest.java
+++ 
b/gobblin-hive-registration/src/test/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManagerTest.java
@@ -40,6 +40,8 @@ import org.apache.gobblin.hive.HiveRegistrationUnit;
 import org.apache.gobblin.hive.HiveTable;
 import org.apache.gobblin.util.HadoopUtils;
 
+import static 
org.apache.gobblin.hive.orc.HiveOrcSerDeManager.ENABLED_ORC_TYPE_CHECK;
+
 
 @Test(singleThreaded = true)
 public class HiveOrcSerDeManagerTest {
@@ -96,6 +98,7 @@ public class HiveOrcSerDeManagerTest {
   @Test
   public void testEmptyExtension() throws IOException {
     State state = new State();
+    state.setProp(ENABLED_ORC_TYPE_CHECK, true);
     state.setProp(HiveOrcSerDeManager.FILE_EXTENSIONS_KEY, ",");
     HiveOrcSerDeManager manager = new HiveOrcSerDeManager(state);
     HiveRegistrationUnit registrationUnit = (new 
HiveTable.Builder()).withDbName(TEST_DB).withTableName(TEST_TABLE).build();
@@ -134,6 +137,7 @@ public class HiveOrcSerDeManagerTest {
   @Test(expectedExceptions = FileNotFoundException.class, 
expectedExceptionsMessageRegExp = "No files in Dataset:orctestdir/register 
found for schema retrieval")
   public void testNoOrcFiles() throws IOException {
     State state = new State();
+    state.setProp(ENABLED_ORC_TYPE_CHECK, true);
     state.setProp(HiveOrcSerDeManager.FILE_EXTENSIONS_KEY, ".notOrc");
     HiveOrcSerDeManager manager = new HiveOrcSerDeManager(state);
     HiveRegistrationUnit registrationUnit = (new 
HiveTable.Builder()).withDbName(TEST_DB).withTableName(TEST_TABLE).build();

Reply via email to