This is an automated email from the ASF dual-hosted git repository.
suvasude pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-gobblin.git
The following commit(s) were added to refs/heads/master by this push:
new ca940de [GOBBLIN-893] Make format-check in ORC-registration optional
and by-default disabled
ca940de is described below
commit ca940de3e659bf77f7bbeeb2fa4b296077441cae
Author: autumnust <[email protected]>
AuthorDate: Wed Oct 2 14:00:40 2019 -0700
[GOBBLIN-893] Make format-check in ORC-registration optional and by-default
disabled
Closes #2748 from autumnust/disableFormatCheck
---
.../java/org/apache/gobblin/hive/orc/HiveOrcSerDeManager.java | 11 ++++++++++-
.../org/apache/gobblin/hive/orc/HiveOrcSerDeManagerTest.java | 4 ++++
2 files changed, 14 insertions(+), 1 deletion(-)
diff --git
a/gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManager.java
b/gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManager.java
index 436420c..7fdceba 100644
---
a/gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManager.java
+++
b/gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManager.java
@@ -83,6 +83,9 @@ public class HiveOrcSerDeManager extends HiveSerDeManager {
public static final String HIVE_SPEC_SCHEMA_READING_TIMER =
"hiveOrcSerdeManager.schemaReadTimer";
+ public static final String ENABLED_ORC_TYPE_CHECK =
"hiveOrcSerdeManager.enableFormatCheck";
+ public static final boolean DEFAULT_ENABLED_ORC_TYPE_CHECK = false;
+
private static final int EXPECTED_FOOTER_SIZE = 16 * 1024;
private static final String ORC_FORMAT = "ORC";
private static final ByteBuffer MAGIC_BUFFER =
ByteBuffer.wrap(ORC_FORMAT.getBytes(Charsets.UTF_8));
@@ -91,6 +94,7 @@ public class HiveOrcSerDeManager extends HiveSerDeManager {
private final HiveSerDeWrapper serDeWrapper;
private final List<String> fileExtensions;
private final List<String> ignoredFilePrefixes;
+ private final boolean checkOrcFormat;
private final MetricContext metricContext;
public HiveOrcSerDeManager(State props)
@@ -102,6 +106,7 @@ public class HiveOrcSerDeManager extends HiveSerDeManager {
this.fileExtensions = extensions.isEmpty() ? ImmutableList.of("") :
extensions;
this.ignoredFilePrefixes = props.getPropAsList(IGNORED_FILE_PREFIXES_KEY,
DEFAULT_IGNORED_FILE_PREFIXES);
+ this.checkOrcFormat = props.getPropAsBoolean(ENABLED_ORC_TYPE_CHECK,
DEFAULT_ENABLED_ORC_TYPE_CHECK);
this.metricContext = Instrumented.getMetricContext(props,
HiveOrcSerDeManager.class);
this.serDeWrapper = HiveSerDeWrapper.get(props.getProp(SERDE_TYPE_KEY,
DEFAULT_SERDE_TYPE),
Optional.of(props.getProp(INPUT_FORMAT_CLASS_KEY,
DEFAULT_INPUT_FORMAT_CLASS)),
@@ -177,7 +182,7 @@ public class HiveOrcSerDeManager extends HiveSerDeManager {
try {
return ignoredFilePrefixes.stream().noneMatch(e ->
path.getName().startsWith(e))
&& fileExtensions.stream().anyMatch(e ->
path.getName().endsWith(e))
- && isORC(path, fs);
+ && (!checkOrcFormat || isORC(path, fs));
} catch(IOException e) {
log.error("Error checking file for schema retrieval", e);
return false;
@@ -199,6 +204,10 @@ public class HiveOrcSerDeManager extends HiveSerDeManager {
/**
* Determine if a file is ORC format.
* Steal ideas & code from presto/OrcReader under Apache License 2.0.
+ *
+ * Note: This operation is pretty expensive when it comes to checking
magicBytes for each file while listing,
+ * as it itself requires getFileStatus and opening the file. In normal cases,
consider disabling it if the confidence level
+ * of format consistency is high enough.
*/
private static boolean isORC(Path file, FileSystem fs)
throws IOException {
diff --git
a/gobblin-hive-registration/src/test/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManagerTest.java
b/gobblin-hive-registration/src/test/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManagerTest.java
index 4ffa738..782469e 100644
---
a/gobblin-hive-registration/src/test/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManagerTest.java
+++
b/gobblin-hive-registration/src/test/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManagerTest.java
@@ -40,6 +40,8 @@ import org.apache.gobblin.hive.HiveRegistrationUnit;
import org.apache.gobblin.hive.HiveTable;
import org.apache.gobblin.util.HadoopUtils;
+import static
org.apache.gobblin.hive.orc.HiveOrcSerDeManager.ENABLED_ORC_TYPE_CHECK;
+
@Test(singleThreaded = true)
public class HiveOrcSerDeManagerTest {
@@ -96,6 +98,7 @@ public class HiveOrcSerDeManagerTest {
@Test
public void testEmptyExtension() throws IOException {
State state = new State();
+ state.setProp(ENABLED_ORC_TYPE_CHECK, true);
state.setProp(HiveOrcSerDeManager.FILE_EXTENSIONS_KEY, ",");
HiveOrcSerDeManager manager = new HiveOrcSerDeManager(state);
HiveRegistrationUnit registrationUnit = (new
HiveTable.Builder()).withDbName(TEST_DB).withTableName(TEST_TABLE).build();
@@ -134,6 +137,7 @@ public class HiveOrcSerDeManagerTest {
@Test(expectedExceptions = FileNotFoundException.class,
expectedExceptionsMessageRegExp = "No files in Dataset:orctestdir/register
found for schema retrieval")
public void testNoOrcFiles() throws IOException {
State state = new State();
+ state.setProp(ENABLED_ORC_TYPE_CHECK, true);
state.setProp(HiveOrcSerDeManager.FILE_EXTENSIONS_KEY, ".notOrc");
HiveOrcSerDeManager manager = new HiveOrcSerDeManager(state);
HiveRegistrationUnit registrationUnit = (new
HiveTable.Builder()).withDbName(TEST_DB).withTableName(TEST_TABLE).build();