This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/master by this push:
new 6598048 ORC-454. Use Spark 2.4.0 in benchmark
6598048 is described below
commit 6598048187226533ac4139a277f245a4fb5469f9
Author: Dongjoon Hyun <[email protected]>
AuthorDate: Tue Dec 25 21:18:31 2018 -0800
ORC-454. Use Spark 2.4.0 in benchmark
Fixes #353
Signed-off-by: Dongjoon Hyun <[email protected]>
---
java/bench/pom.xml | 12 ++++++------
java/bench/spark/pom.xml | 15 +++++++++++----
.../java/org/apache/orc/bench/spark/SparkBenchmark.java | 3 ++-
3 files changed, 19 insertions(+), 11 deletions(-)
diff --git a/java/bench/pom.xml b/java/bench/pom.xml
index 2dbe44f..2727ac8 100644
--- a/java/bench/pom.xml
+++ b/java/bench/pom.xml
@@ -44,7 +44,7 @@
<orc.version>1.5.2</orc.version>
<parquet.version>1.8.3</parquet.version>
<slf4j.version>1.7.25</slf4j.version>
- <spark.version>2.3.1</spark.version>
+ <spark.version>2.4.0</spark.version>
<storage-api.version>2.6.1</storage-api.version>
<zookeeper.version>3.4.6</zookeeper.version>
</properties>
@@ -58,11 +58,6 @@
<dependencyManagement>
<dependencies>
<dependency>
- <groupId>com.databricks</groupId>
- <artifactId>spark-avro_2.11</artifactId>
- <version>3.2.0</version>
- </dependency>
- <dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>2.8.4</version>
@@ -439,6 +434,11 @@
</exclusions>
</dependency>
<dependency>
+ <groupId>org.apache.spark</groupId>
+ <artifactId>spark-avro_2.11</artifactId>
+ <version>${spark.version}</version>
+ </dependency>
+ <dependency>
<groupId>org.codehaus.janino</groupId>
<artifactId>janino</artifactId>
<version>3.0.8</version>
diff --git a/java/bench/spark/pom.xml b/java/bench/spark/pom.xml
index 90e29a4..ab196d8 100644
--- a/java/bench/spark/pom.xml
+++ b/java/bench/spark/pom.xml
@@ -40,10 +40,6 @@
<dependencies>
<dependency>
- <groupId>com.databricks</groupId>
- <artifactId>spark-avro_2.11</artifactId>
- </dependency>
- <dependency>
<groupId>com.google.auto.service</groupId>
<artifactId>auto-service</artifactId>
</dependency>
@@ -104,6 +100,17 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
+ <!-- Spark 2.4 uses Parquet 1.10.0 -->
+ <dependency>
+ <groupId>org.apache.parquet</groupId>
+ <artifactId>parquet-hadoop</artifactId>
+ <version>1.10.0</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.spark</groupId>
+ <artifactId>spark-avro_2.11</artifactId>
+ </dependency>
<dependency>
<groupId>org.codehaus.janino</groupId>
<artifactId>janino</artifactId>
diff --git a/java/bench/spark/src/java/org/apache/orc/bench/spark/SparkBenchmark.java b/java/bench/spark/src/java/org/apache/orc/bench/spark/SparkBenchmark.java
index 87d3277..992b686 100644
--- a/java/bench/spark/src/java/org/apache/orc/bench/spark/SparkBenchmark.java
+++ b/java/bench/spark/src/java/org/apache/orc/bench/spark/SparkBenchmark.java
@@ -28,6 +28,7 @@ import org.apache.orc.bench.core.OrcBenchmark;
import org.apache.orc.bench.core.ReadCounters;
import org.apache.orc.bench.core.Utilities;
import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.avro.AvroFileFormat;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.execution.datasources.FileFormat;
import org.apache.spark.sql.execution.datasources.PartitionedFile;
@@ -133,7 +134,7 @@ public class SparkBenchmark implements OrcBenchmark {
}
switch (format) {
case "avro":
- formatObject = new com.databricks.spark.avro.DefaultSource();
+ formatObject = new AvroFileFormat();
break;
case "orc":
formatObject = new OrcFileFormat();