Repository: incubator-zeppelin
Updated Branches:
  refs/heads/master b12edcd3a -> 69537f141
[ZEPPELIN-395] Support Spark 1.6

Adding support for Spark 1.6

Status:
- [x] Spark/Scala
- [x] PySpark
- [x] Spark SQL - broken - fixed

TODO:
- [x] update pom when the artifacts are on central repo
- [x] update travis to build 1.6
- [x] update doc (updated readme)

Author: Felix Cheung <[email protected]>

Closes #463 from felixcheung/spark16 and squashes the following commits:

e2f444f [Felix Cheung] push readme update
0809031 [Felix Cheung] reduce test to run for spark 1.5.2
eaf3127 [Felix Cheung] change to final url for spark download, add to travis
52c1d75 [Felix Cheung] fix url for spark hist (this is one that works for now)
97e6b3b [Felix Cheung] 1.6 from maven
30844d7 [Felix Cheung] fix progressing result DataFrame - z.showDF and %sql work now
f0c2207 [Felix Cheung] Spark/PySpark working

Project: http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/commit/69537f14
Tree: http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/tree/69537f14
Diff: http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/diff/69537f14

Branch: refs/heads/master
Commit: 69537f141236e2db499eadcccaec7b5b5f679c85
Parents: b12edcd
Author: Felix Cheung <[email protected]>
Authored: Tue Jan 5 23:51:08 2016 -0800
Committer: Lee moon soo <[email protected]>
Committed: Thu Jan 7 13:31:23 2016 -0800

----------------------------------------------------------------------
 .travis.yml                                     |  13 +-
 README.md                                       |   5 +-
 bin/interpreter.sh                              |  11 +-
 spark-dependencies/pom.xml                      | 240 ++++++++++---------
 spark/pom.xml                                   |  74 ++++--
 .../apache/zeppelin/spark/SparkInterpreter.java |   3 +-
 .../org/apache/zeppelin/spark/SparkVersion.java |   3 +-
 .../apache/zeppelin/spark/ZeppelinContext.java  |  22 +-
 8 files changed, 220 insertions(+), 151 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/blob/69537f14/.travis.yml
----------------------------------------------------------------------
diff --git a/.travis.yml b/.travis.yml
index 6908594..74f1805 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -22,17 +22,24 @@ before_install:
   - "sh -e /etc/init.d/xvfb start"
 
 install:
-  - mvn package -DskipTests -Pspark-1.5 -Phadoop-2.3 -Ppyspark -Pscalding -B
+  - mvn package -DskipTests -Pspark-1.6 -Phadoop-2.3 -Ppyspark -Pscalding -B
 
 before_script:
   -
 
 script:
+  # spark 1.6
+  - mvn package -Pbuild-distr -Pspark-1.6 -Phadoop-2.3 -Ppyspark -Pscalding -B
+  - ./testing/startSparkCluster.sh 1.6.0 2.3
+  - echo "export SPARK_HOME=`pwd`/spark-1.6.0-bin-hadoop2.3" > conf/zeppelin-env.sh
+  - mvn verify -Pusing-packaged-distr -Pspark-1.6 -Phadoop-2.3 -Ppyspark -Pscalding -B
+  - ./testing/stopSparkCluster.sh 1.6.0 2.3
   # spark 1.5
-  - mvn package -Pbuild-distr -Pspark-1.5 -Phadoop-2.3 -Ppyspark -Pscalding -B
+  - rm -rf `pwd`/interpreter/spark
+  - mvn package -DskipTests -Pspark-1.5 -Phadoop-2.3 -Ppyspark -B -pl 'zeppelin-interpreter,spark-dependencies,spark'
   - ./testing/startSparkCluster.sh 1.5.2 2.3
   - echo "export SPARK_HOME=`pwd`/spark-1.5.2-bin-hadoop2.3" > conf/zeppelin-env.sh
-  - mvn verify -Pusing-packaged-distr -Pspark-1.5 -Phadoop-2.3 -Ppyspark -Pscalding -B
+  - mvn package -Pspark-1.5 -Phadoop-2.3 -B -pl 'zeppelin-interpreter,zeppelin-zengine,zeppelin-server' -Dtest=org.apache.zeppelin.rest.*Test -DfailIfNoTests=false
   - ./testing/stopSparkCluster.sh 1.5.2 2.3
   # spark 1.4
   - rm -rf `pwd`/interpreter/spark
http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/blob/69537f14/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index 9a99b78..67ee0b6 100644
--- a/README.md
+++ b/README.md
@@ -67,6 +67,7 @@ Set spark major version
 Available profiles are
 
 ```
+-Pspark-1.6
 -Pspark-1.5
 -Pspark-1.4
 -Pspark-1.3
@@ -134,13 +135,13 @@ Here're some examples:
 
 ```
 # basic build
-mvn clean package -Pspark-1.5 -Phadoop-2.4 -Pyarn -Ppyspark
+mvn clean package -Pspark-1.6 -Phadoop-2.4 -Pyarn -Ppyspark
 
 # spark-cassandra integration
 mvn clean package -Pcassandra-spark-1.5 -Dhadoop.version=2.6.0 -Phadoop-2.6 -DskipTests
 
 # with CDH
-mvn clean package -Pspark-1.2 -Dhadoop.version=2.5.0-cdh5.3.0 -Phadoop-2.4 -Pvendor-repo -DskipTests
+mvn clean package -Pspark-1.5 -Dhadoop.version=2.6.0-cdh5.5.0 -Phadoop-2.6 -Pvendor-repo -DskipTests
 
 # with MapR
 mvn clean package -Pspark-1.5 -Pmapr50 -DskipTests


http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/blob/69537f14/bin/interpreter.sh
----------------------------------------------------------------------
diff --git a/bin/interpreter.sh b/bin/interpreter.sh
index ba7f017..b5603c8 100755
--- a/bin/interpreter.sh
+++ b/bin/interpreter.sh
@@ -81,8 +81,11 @@ if [[ "${INTERPRETER_ID}" == "spark" ]]; then
     # This will eventually pass SPARK_APP_JAR to the classpath of SparkIMain
     ZEPPELIN_CLASSPATH=${SPARK_APP_JAR}
 
+    pattern="$SPARK_HOME/python/lib/py4j-*-src.zip"
+    py4j=($pattern)
+    # pick the first matching py4j zip - there should only be one
     export PYTHONPATH="$SPARK_HOME/python/:$PYTHONPATH"
-    export PYTHONPATH="$SPARK_HOME/python/lib/py4j-0.8.2.1-src.zip:$PYTHONPATH"
+    export PYTHONPATH="${py4j[0]}:$PYTHONPATH"
   else
     # add Hadoop jars into classpath
     if [[ -n "${HADOOP_HOME}" ]]; then
@@ -95,7 +98,11 @@ if [[ "${INTERPRETER_ID}" == "spark" ]]; then
   fi
 
   addJarInDir "${INTERPRETER_DIR}/dep"
-  PYSPARKPATH="${ZEPPELIN_HOME}/interpreter/spark/pyspark/pyspark.zip:${ZEPPELIN_HOME}/interpreter/spark/pyspark/py4j-0.8.2.1-src.zip"
+
+  pattern="${ZEPPELIN_HOME}/interpreter/spark/pyspark/py4j-*-src.zip"
+  py4j=($pattern)
+  # pick the first matching py4j zip - there should only be one
+  PYSPARKPATH="${ZEPPELIN_HOME}/interpreter/spark/pyspark/pyspark.zip:${py4j[0]}"
 
   if [[ -z "${PYTHONPATH}" ]]; then
     export PYTHONPATH="${PYSPARKPATH}"
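Note on the change above: Spark 1.6 ships py4j 0.9 while earlier Spark releases ship py4j 0.8.2.1, so the hard-coded zip name had to become a glob. A minimal Java sketch of the same version-agnostic discovery, useful if the lookup ever needs to happen on the JVM side; the class and helper names here are illustrative only, not part of this commit:

import java.io.File;
import java.io.FileNotFoundException;

public class Py4jLocator {
  // Find the py4j source zip (py4j-0.9-src.zip, py4j-0.8.2.1-src.zip, ...)
  // without hard-coding its version, mirroring the bash glob py4j-*-src.zip.
  public static File findPy4jZip(File libDir) throws FileNotFoundException {
    File[] matches = libDir.listFiles((dir, name) -> name.matches("py4j-.*-src\\.zip"));
    if (matches == null || matches.length == 0) {
      throw new FileNotFoundException("no py4j-*-src.zip in " + libDir);
    }
    return matches[0]; // there should only be one per Spark distribution
  }

  public static void main(String[] args) throws FileNotFoundException {
    // e.g. $SPARK_HOME/python/lib, as in the shell change above
    System.out.println(findPy4jZip(new File(System.getenv("SPARK_HOME"), "python/lib")));
  }
}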
http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/blob/69537f14/spark-dependencies/pom.xml
----------------------------------------------------------------------
diff --git a/spark-dependencies/pom.xml b/spark-dependencies/pom.xml
index 3af0678..cb8bfc4 100644
--- a/spark-dependencies/pom.xml
+++ b/spark-dependencies/pom.xml
@@ -33,7 +33,7 @@
   <name>Zeppelin: Spark dependencies</name>
   <description>Zeppelin spark support</description>
   <url>http://zeppelin.incubator.apache.org</url>
-
+
   <properties>
     <spark.version>1.4.1</spark.version>
@@ -130,117 +130,117 @@
         </exclusion>
       </exclusions>
     </dependency>
-    <dependency>
-      <groupId>org.apache.hadoop</groupId>
-      <artifactId>hadoop-yarn-api</artifactId>
-      <version>${yarn.version}</version>
-      <exclusions>
-        <exclusion>
-          <groupId>asm</groupId>
-          <artifactId>asm</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.ow2.asm</groupId>
-          <artifactId>asm</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.jboss.netty</groupId>
-          <artifactId>netty</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>commons-logging</groupId>
-          <artifactId>commons-logging</artifactId>
-        </exclusion>
-      </exclusions>
-    </dependency>
-
-    <dependency>
-      <groupId>org.apache.hadoop</groupId>
-      <artifactId>hadoop-yarn-common</artifactId>
-      <version>${yarn.version}</version>
-      <exclusions>
-        <exclusion>
-          <groupId>asm</groupId>
-          <artifactId>asm</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.ow2.asm</groupId>
-          <artifactId>asm</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.jboss.netty</groupId>
-          <artifactId>netty</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>javax.servlet</groupId>
-          <artifactId>servlet-api</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>commons-logging</groupId>
-          <artifactId>commons-logging</artifactId>
-        </exclusion>
-      </exclusions>
-    </dependency>
-
-    <dependency>
-      <groupId>org.apache.hadoop</groupId>
-      <artifactId>hadoop-yarn-server-web-proxy</artifactId>
-      <version>${yarn.version}</version>
-      <exclusions>
-        <exclusion>
-          <groupId>asm</groupId>
-          <artifactId>asm</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.ow2.asm</groupId>
-          <artifactId>asm</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.jboss.netty</groupId>
-          <artifactId>netty</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>javax.servlet</groupId>
-          <artifactId>servlet-api</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>commons-logging</groupId>
-          <artifactId>commons-logging</artifactId>
-        </exclusion>
-      </exclusions>
-    </dependency>
-
-    <dependency>
-      <groupId>org.apache.hadoop</groupId>
-      <artifactId>hadoop-yarn-client</artifactId>
-      <version>${yarn.version}</version>
-      <exclusions>
-        <exclusion>
-          <groupId>asm</groupId>
-          <artifactId>asm</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.ow2.asm</groupId>
-          <artifactId>asm</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.jboss.netty</groupId>
-          <artifactId>netty</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>javax.servlet</groupId>
-          <artifactId>servlet-api</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>commons-logging</groupId>
-          <artifactId>commons-logging</artifactId>
-        </exclusion>
-      </exclusions>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-yarn-api</artifactId>
+      <version>${yarn.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>asm</groupId>
+          <artifactId>asm</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.ow2.asm</groupId>
+          <artifactId>asm</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.jboss.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>commons-logging</groupId>
+          <artifactId>commons-logging</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-yarn-common</artifactId>
+      <version>${yarn.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>asm</groupId>
+          <artifactId>asm</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.ow2.asm</groupId>
+          <artifactId>asm</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.jboss.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>javax.servlet</groupId>
+          <artifactId>servlet-api</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>commons-logging</groupId>
+          <artifactId>commons-logging</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-yarn-server-web-proxy</artifactId>
+      <version>${yarn.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>asm</groupId>
+          <artifactId>asm</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.ow2.asm</groupId>
+          <artifactId>asm</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.jboss.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>javax.servlet</groupId>
+          <artifactId>servlet-api</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>commons-logging</groupId>
+          <artifactId>commons-logging</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-yarn-client</artifactId>
+      <version>${yarn.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>asm</groupId>
+          <artifactId>asm</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.ow2.asm</groupId>
+          <artifactId>asm</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.jboss.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>javax.servlet</groupId>
+          <artifactId>servlet-api</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>commons-logging</groupId>
+          <artifactId>commons-logging</artifactId>
+        </exclusion>
+      </exclusions>
     </dependency>
   </dependencies>
   </dependencyManagement>
-
-  <dependencies>
+
+  <dependencies>
     <!-- Spark -->
     <dependency>
       <groupId>org.apache.spark</groupId>
@@ -489,7 +489,7 @@
       <dependencies>
       </dependencies>
     </profile>
-
+
     <profile>
       <id>cassandra-spark-1.5</id>
       <properties>
@@ -513,7 +513,21 @@
         </dependency>
       </dependencies>
     </profile>
-
+
+    <profile>
+      <id>spark-1.6</id>
+      <properties>
+        <spark.version>1.6.0</spark.version>
+        <py4j.version>0.9</py4j.version>
+        <akka.group>com.typesafe.akka</akka.group>
+        <akka.version>2.3.11</akka.version>
+        <protobuf.version>2.5.0</protobuf.version>
+      </properties>
+
+      <dependencies>
+      </dependencies>
+    </profile>
+
     <profile>
       <id>hadoop-0.23</id>
       <!-- SPARK-1121: Adds an explicit dependency on Avro to work around a
@@ -731,10 +745,6 @@
 
     <profile>
       <id>pyspark</id>
-      <properties>
-        <spark.download.url>http://archive.apache.org/dist/spark/spark-${spark.version}/spark-${spark.version}.tgz
-        </spark.download.url>
-      </properties>
       <build>
         <plugins>
           <plugin>
http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/blob/69537f14/spark/pom.xml
----------------------------------------------------------------------
diff --git a/spark/pom.xml b/spark/pom.xml
index 174595a..2bc1d6d 100644
--- a/spark/pom.xml
+++ b/spark/pom.xml
@@ -33,16 +33,70 @@
   <name>Zeppelin: Spark</name>
   <description>Zeppelin spark support</description>
   <url>http://zeppelin.incubator.apache.org</url>
-
+
   <properties>
     <spark.version>1.4.1</spark.version>
     <scala.version>2.10.4</scala.version>
     <scala.binary.version>2.10</scala.binary.version>
-    <hadoop.version>2.3.0</hadoop.version>
     <py4j.version>0.8.2.1</py4j.version>
   </properties>
 
+  <profiles>
+    <profile>
+      <id>vendor-repo</id>
+      <repositories>
+        <repository>
+          <id>cloudera</id>
+          <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
+        </repository>
+      </repositories>
+    </profile>
+
+    <profile>
+      <id>spark-1.1</id>
+      <properties>
+        <spark.version>1.1.1</spark.version>
+      </properties>
+    </profile>
+
+    <profile>
+      <id>spark-1.2</id>
+      <properties>
+        <spark.version>1.2.1</spark.version>
+      </properties>
+    </profile>
+
+    <profile>
+      <id>spark-1.3</id>
+      <properties>
+        <spark.version>1.3.1</spark.version>
+      </properties>
+    </profile>
+
+    <profile>
+      <id>spark-1.4</id>
+      <properties>
+        <spark.version>1.4.1</spark.version>
+      </properties>
+    </profile>
+
+    <profile>
+      <id>spark-1.5</id>
+      <properties>
+        <spark.version>1.5.2</spark.version>
+      </properties>
+    </profile>
+
+    <profile>
+      <id>spark-1.6</id>
+      <properties>
+        <spark.version>1.6.0</spark.version>
+        <py4j.version>0.9</py4j.version>
+      </properties>
+    </profile>
+  </profiles>
+
   <dependencies>
     <dependency>
       <groupId>org.slf4j</groupId>
@@ -72,7 +126,7 @@
       <artifactId>guava</artifactId>
       <version>14.0.1</version>
     </dependency>
-
+
     <!-- Aether :: maven dependency resolution -->
     <dependency>
       <groupId>org.apache.maven</groupId>
@@ -294,18 +348,6 @@
     </dependency>
   </dependencies>
 
-  <profiles>
-    <profile>
-      <id>vendor-repo</id>
-      <repositories>
-        <repository>
-          <id>cloudera</id>
-          <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
-        </repository>
-      </repositories>
-    </profile>
-  </profiles>
-
   <build>
     <plugins>
      <plugin>
@@ -397,7 +439,7 @@
           <overWriteReleases>false</overWriteReleases>
           <overWriteSnapshots>false</overWriteSnapshots>
           <overWriteIfNewer>true</overWriteIfNewer>
-          <includeScope>runtime</includeScope>
+          <includeScope>runtime</includeScope>
           <artifactItems>
             <artifactItem>
               <groupId>${project.groupId}</groupId>
http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/blob/69537f14/spark/src/main/java/org/apache/zeppelin/spark/SparkInterpreter.java
----------------------------------------------------------------------
diff --git a/spark/src/main/java/org/apache/zeppelin/spark/SparkInterpreter.java b/spark/src/main/java/org/apache/zeppelin/spark/SparkInterpreter.java
index 095970f..2f2829e 100644
--- a/spark/src/main/java/org/apache/zeppelin/spark/SparkInterpreter.java
+++ b/spark/src/main/java/org/apache/zeppelin/spark/SparkInterpreter.java
@@ -317,7 +317,8 @@ public class SparkInterpreter extends Interpreter {
           "python" + File.separator + "lib");
     }
 
-    String[] pythonLibs = new String[]{"pyspark.zip", "py4j-0.8.2.1-src.zip"};
+    // Only one of py4j-0.9-src.zip and py4j-0.8.2.1-src.zip should exist
+    String[] pythonLibs = new String[]{"pyspark.zip", "py4j-0.9-src.zip", "py4j-0.8.2.1-src.zip"};
     ArrayList<String> pythonLibUris = new ArrayList<>();
     for (String lib : pythonLibs) {
       File libFile = new File(pysparkPath, lib);


http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/blob/69537f14/spark/src/main/java/org/apache/zeppelin/spark/SparkVersion.java
----------------------------------------------------------------------
diff --git a/spark/src/main/java/org/apache/zeppelin/spark/SparkVersion.java b/spark/src/main/java/org/apache/zeppelin/spark/SparkVersion.java
index a362938..eb1c0a2 100644
--- a/spark/src/main/java/org/apache/zeppelin/spark/SparkVersion.java
+++ b/spark/src/main/java/org/apache/zeppelin/spark/SparkVersion.java
@@ -32,9 +32,10 @@ public class SparkVersion {
   public static final SparkVersion SPARK_1_4_0 = SparkVersion.fromVersionString("1.4.0");
   public static final SparkVersion SPARK_1_5_0 = SparkVersion.fromVersionString("1.5.0");
   public static final SparkVersion SPARK_1_6_0 = SparkVersion.fromVersionString("1.6.0");
+  public static final SparkVersion SPARK_1_7_0 = SparkVersion.fromVersionString("1.7.0");
 
   public static final SparkVersion MIN_SUPPORTED_VERSION = SPARK_1_0_0;
-  public static final SparkVersion UNSUPPORTED_FUTURE_VERSION = SPARK_1_6_0;
+  public static final SparkVersion UNSUPPORTED_FUTURE_VERSION = SPARK_1_7_0;
 
   private int version;
   private String versionString;
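Note: SPARK_1_7_0 exists only to serve as the exclusive upper bound of the supported range; a build is accepted when its version is at least MIN_SUPPORTED_VERSION and strictly below UNSUPPORTED_FUTURE_VERSION, so enabling 1.6 means pushing the sentinel to the next, not-yet-released version. A rough standalone sketch of that half-open check, assuming a simple numeric encoding (an illustration, not Zeppelin's actual SparkVersion implementation):

public class VersionGate {
  // Encode "major.minor.patch" as major*10000 + minor*100 + patch,
  // e.g. "1.6.0" -> 10600, so versions compare as plain ints.
  static int encode(String version) {
    String[] p = version.split("\\.");
    return Integer.parseInt(p[0]) * 10000
         + Integer.parseInt(p[1]) * 100
         + Integer.parseInt(p[2]);
  }

  static final int MIN_SUPPORTED = encode("1.0.0");      // inclusive lower bound
  static final int UNSUPPORTED_FUTURE = encode("1.7.0"); // exclusive upper bound

  static boolean isSupported(String version) {
    int v = encode(version);
    return v >= MIN_SUPPORTED && v < UNSUPPORTED_FUTURE;
  }

  public static void main(String[] args) {
    System.out.println(isSupported("1.6.0")); // true after this commit
    System.out.println(isSupported("1.7.0")); // false: untested future release
  }
}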
http://git-wip-us.apache.org/repos/asf/incubator-zeppelin/blob/69537f14/spark/src/main/java/org/apache/zeppelin/spark/ZeppelinContext.java
----------------------------------------------------------------------
diff --git a/spark/src/main/java/org/apache/zeppelin/spark/ZeppelinContext.java b/spark/src/main/java/org/apache/zeppelin/spark/ZeppelinContext.java
index 0d2d50c..5ec38d4 100644
--- a/spark/src/main/java/org/apache/zeppelin/spark/ZeppelinContext.java
+++ b/spark/src/main/java/org/apache/zeppelin/spark/ZeppelinContext.java
@@ -295,30 +295,30 @@ public class ZeppelinContext extends HashMap<String, Object> {
     try {
       take = df.getClass().getMethod("take", int.class);
       rows = (Object[]) take.invoke(df, maxResult + 1);
-
     } catch (NoSuchMethodException | SecurityException | IllegalAccessException
         | IllegalArgumentException | InvocationTargetException | ClassCastException e) {
       sc.clearJobGroup();
       throw new InterpreterException(e);
     }
 
-    String msg = null;
-
+    List<Attribute> columns = null;
     // get field names
-    Method queryExecution;
-    QueryExecution qe;
     try {
-      queryExecution = df.getClass().getMethod("queryExecution");
-      qe = (QueryExecution) queryExecution.invoke(df);
+      // Use reflection because the classname returned by queryExecution changes:
+      // Spark up to 1.5.2: org.apache.spark.sql.SQLContext$QueryExecution
+      // Spark 1.6.0 and later: org.apache.spark.sql.hive.HiveContext$QueryExecution
+      Object qe = df.getClass().getMethod("queryExecution").invoke(df);
+      Object a = qe.getClass().getMethod("analyzed").invoke(qe);
+      scala.collection.Seq seq = (scala.collection.Seq) a.getClass().getMethod("output").invoke(a);
+
+      columns = (List<Attribute>) scala.collection.JavaConverters.seqAsJavaListConverter(seq)
+          .asJava();
     } catch (NoSuchMethodException | SecurityException | IllegalAccessException
         | IllegalArgumentException | InvocationTargetException e) {
       throw new InterpreterException(e);
     }
 
-    List<Attribute> columns =
-        scala.collection.JavaConverters.asJavaListConverter(
-            qe.analyzed().output()).asJava();
-
+    String msg = null;
     for (Attribute col : columns) {
       if (msg == null) {
         msg = col.name();
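Note: the cast to QueryExecution was removed because the concrete class behind DataFrame.queryExecution() differs across Spark releases, so a compile-time type can match only one of them; the fix resolves each call against the runtime class instead. A self-contained sketch of that reflective call-chain pattern; the helper below is illustrative, not the code added by this commit:

import java.lang.reflect.InvocationTargetException;

public class ReflectiveChain {
  // Invoke a chain of no-arg methods on an object, resolving each method
  // against the runtime class so the concrete return type may vary per
  // Spark version, e.g. walk(df, "queryExecution", "analyzed", "output").
  public static Object walk(Object target, String... methodNames)
      throws NoSuchMethodException, IllegalAccessException, InvocationTargetException {
    Object current = target;
    for (String name : methodNames) {
      current = current.getClass().getMethod(name).invoke(current);
    }
    return current;
  }
}

The Seq that falls out of output() is then converted to a Java List with JavaConverters.seqAsJavaListConverter, exactly as in the hunk above.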
