This is an automated email from the ASF dual-hosted git repository.
shirshanka pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-gobblin.git
The following commit(s) were added to refs/heads/master by this push:
new 1d32adb [GOBBLIN-968] Honor file split size for HadoopFileInputSource
1d32adb is described below
commit 1d32adbe300ff3121cf2b826990c0616f625a27c
Author: ctodarka <[email protected]>
AuthorDate: Wed Dec 9 19:07:04 2020 -0800
[GOBBLIN-968] Honor file split size for HadoopFileInputSource
Set the values of SPLIT_MINSIZE and SPLIT_MAXSIZE
in the FileInputFormat class from the configs
source.hadoop.file.input.split.minsize and
source.hadoop.file.input.split.maxsize, respectively.
Closes #2816 from chiragtodarka/GOBBLIN-968
---
.../gobblin/source/extractor/hadoop/HadoopFileInputSource.java | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/gobblin-core/src/main/java/org/apache/gobblin/source/extractor/hadoop/HadoopFileInputSource.java b/gobblin-core/src/main/java/org/apache/gobblin/source/extractor/hadoop/HadoopFileInputSource.java
index 98a5c52..34406ac 100644
--- a/gobblin-core/src/main/java/org/apache/gobblin/source/extractor/hadoop/HadoopFileInputSource.java
+++ b/gobblin-core/src/main/java/org/apache/gobblin/source/extractor/hadoop/HadoopFileInputSource.java
@@ -78,6 +78,8 @@ public abstract class HadoopFileInputSource<S, D, K, V>
extends AbstractSource<S
public static final String FILE_SPLITS_DESIRED_KEY =
HADOOP_SOURCE_KEY_PREFIX + "file.splits.desired";
public static final int DEFAULT_FILE_SPLITS_DESIRED = 1;
public static final String FILE_INPUT_PATHS_KEY = HADOOP_SOURCE_KEY_PREFIX +
"file.input.paths";
+ public static final String FILE_INPUT_SPLIT_MINSIZE =
HADOOP_SOURCE_KEY_PREFIX + "file.input.split.minsize";
+ public static final String FILE_INPUT_SPLIT_MAXSIZE =
HADOOP_SOURCE_KEY_PREFIX + "file.input.split.maxsize";
public static final String FILE_INPUT_READ_KEYS_KEY =
HADOOP_SOURCE_KEY_PREFIX + "file.read.keys";
public static final boolean DEFAULT_FILE_INPUT_READ_KEYS = false;
public static final String FILE_SPLIT_PATH_KEY = HADOOP_SOURCE_KEY_PREFIX +
"file.split.path";
@@ -94,6 +96,14 @@ public abstract class HadoopFileInputSource<S, D, K, V>
extends AbstractSource<S
}
}
+ if (state.contains(FILE_INPUT_SPLIT_MINSIZE)) {
+ FileInputFormat.setMinInputSplitSize(job,
state.getPropAsLong(FILE_INPUT_SPLIT_MINSIZE));
+ }
+
+ if (state.contains(FILE_INPUT_SPLIT_MAXSIZE)) {
+ FileInputFormat.setMaxInputSplitSize(job,
state.getPropAsLong(FILE_INPUT_SPLIT_MAXSIZE));
+ }
+
FileInputFormat<K, V> fileInputFormat = getFileInputFormat(state,
job.getConfiguration());
List<InputSplit> fileSplits = fileInputFormat.getSplits(job);
if (fileSplits == null || fileSplits.isEmpty()) {