shaofengshi commented on a change in pull request #441: Kylin 3494: Build cube with spark reports ArrayIndexOutOfBoundsException
URL: https://github.com/apache/kylin/pull/441#discussion_r249680441
##########
File path: engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkUtil.java
##########
@@ -143,30 +139,7 @@ public static void modifySparkHadoopConfiguration(SparkContext sc) throws Except
         sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.DefaultCodec"); // or org.apache.hadoop.io.compress.SnappyCodec
     }
-    public static JavaRDD<String[]> hiveRecordInputRDD(boolean isSequenceFile, JavaSparkContext sc, String inputPath, String hiveTable) throws IOException {
-        JavaRDD<String[]> recordRDD;
-
-        if (isSequenceFile && HadoopUtil.isSequenceDir(sc.hadoopConfiguration(), new Path(inputPath))) {
-            recordRDD = getSequenceFormatHiveInput(sc, inputPath);
-        } else {
-            recordRDD = getOtherFormatHiveInput(sc, hiveTable);
-        }
-
-        return recordRDD;
-    }
-
-    private static JavaRDD<String[]> getSequenceFormatHiveInput(JavaSparkContext sc, String inputPath) {
-        return sc.sequenceFile(inputPath, BytesWritable.class, Text.class).values()
-                .map(new Function<Text, String[]>() {
-                    @Override
-                    public String[] call(Text text) throws Exception {
-                        String s = Bytes.toString(text.getBytes(), 0, text.getLength());
-                        return s.split(BatchConstants.SEQUENCE_FILE_DEFAULT_DELIMITER);
-                    }
-                });
-    }
-
-    private static JavaRDD<String[]> getOtherFormatHiveInput(JavaSparkContext sc, String hiveTable) {
+    public static JavaRDD<String[]> getHiveInput(JavaSparkContext sc, String hiveTable) {
Review comment:
You deleted this logic, so every case will now go through the Hive context to be parsed, which adds a dependency on Hive that we want to avoid...
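
For illustration, a minimal sketch of how the sequence-file fast path could be kept while still delegating other formats to the new method. It would sit next to getHiveInput in SparkUtil.java; the import paths follow the Kylin layout of that time and the exact behaviour of getHiveInput is assumed from this diff, not confirmed:

    import java.io.IOException;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.BytesWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.kylin.common.util.Bytes;
    import org.apache.kylin.common.util.HadoopUtil;
    import org.apache.kylin.engine.mr.common.BatchConstants;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function;

    // Sketch only: read sequence-file flat tables straight from HDFS and let only the
    // remaining formats fall back to the Hive-backed path added in this PR.
    public static JavaRDD<String[]> hiveRecordInputRDD(boolean isSequenceFile, JavaSparkContext sc,
            String inputPath, String hiveTable) throws IOException {
        if (isSequenceFile && HadoopUtil.isSequenceDir(sc.hadoopConfiguration(), new Path(inputPath))) {
            // Fast path: no Hive dependency, just read the sequence file and split each row.
            return sc.sequenceFile(inputPath, BytesWritable.class, Text.class).values()
                    .map(new Function<Text, String[]>() {
                        @Override
                        public String[] call(Text text) throws Exception {
                            String s = Bytes.toString(text.getBytes(), 0, text.getLength());
                            return s.split(BatchConstants.SEQUENCE_FILE_DEFAULT_DELIMITER);
                        }
                    });
        }
        // Everything else still goes through the Hive context via the method added in this PR.
        return getHiveInput(sc, hiveTable);
    }

Callers that know the flat table is stored as a sequence file would keep passing isSequenceFile = true, so only non-sequence-file storage formats would ever touch the Hive context.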