This is an automated email from the ASF dual-hosted git repository. xiangfu pushed a commit to branch pinot-spark in repository https://gitbox.apache.org/repos/asf/incubator-pinot.git
commit 996bcc5c28034dc40de766428249d5f30c26d331 Author: Xiang Fu <[email protected]> AuthorDate: Tue Nov 5 17:08:08 2019 -0800 initial commit for pinot spark --- pinot-spark/README.md | 78 +++++++++++++++++++ pinot-spark/pom.xml | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++ pom.xml | 20 +++++ 3 files changed, 300 insertions(+) diff --git a/pinot-spark/README.md b/pinot-spark/README.md new file mode 100644 index 0000000..069cada --- /dev/null +++ b/pinot-spark/README.md @@ -0,0 +1,78 @@ +<!-- + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +--> +# Pinot Spark + +Introduction +------------ + +Pinot supports data segment generation from Spark. + + +Build +----- + +To build the project: + +``` +mvn clean install -DskipTests +``` + +This will create a fat jar for the pinot-spark module. + +Run +--- + +Create a job properties configuration file, e.g.: + +``` +# Segment creation job configs: +path.to.input=pinot/input/data +path.to.output=pinot/output +path.to.schema=pinot/input/schema/data.schema +segment.table.name=testTable + +# Segment tar push job configs: +push.to.hosts=controller_host_0,controller_host_1 +push.to.port=8888 +``` + +The Pinot data schema file needs to be available locally, and its path should be set in the job properties file.
+ +The `org.apache.pinot.spark.PinotSparkJobLauncher` class (the main class of the shaded JAR in `pinot-spark`) should be run to accomplish this: + +``` +# Segment creation + spark jar pinot-spark-1.0-SNAPSHOT.jar SegmentCreation job.properties + +After this point, we have built the data segment from the raw data file. +The next step is to push that data to the Pinot controller. + +# Segment tar push + spark jar pinot-spark-1.0-SNAPSHOT.jar SegmentTarPush job.properties + +There is also a job that combines the two jobs together. + +# Segment creation and tar push + spark jar pinot-spark-1.0-SNAPSHOT.jar SegmentCreationAndTarPush job.properties +``` + + + diff --git a/pinot-spark/pom.xml b/pinot-spark/pom.xml new file mode 100644 index 0000000..615b21a --- /dev/null +++ b/pinot-spark/pom.xml @@ -0,0 +1,202 @@ +<?xml version="1.0"?> +<!-- + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+ +--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <artifactId>pinot</artifactId> + <groupId>org.apache.pinot</groupId> + <version>0.2.0-SNAPSHOT</version> + </parent> + <artifactId>pinot-spark</artifactId> + <name>Pinot Spark</name> + <url>https://pinot.apache.org/</url> + <properties> + <pinot.root>${basedir}/..</pinot.root> + </properties> + <profiles> + <profile> + <id>build-shaded-jar</id> + <activation> + <activeByDefault>false</activeByDefault> + </activation> + <build> + <plugins> + <plugin> + <artifactId>maven-shade-plugin</artifactId> + <version>3.2.1</version> + <executions> + <execution> + <phase>package</phase> + <goals> + <goal>shade</goal> + </goals> + <configuration> + <!-- + Usually in hadoop environment, there are multiple jars with different versions. + Most of the NoSuchMethodExceptions are caused by class loading conflicts. + Class relocation ensures the reference of certain packages/classes in Pinot code to + shaded libs, e.g. jackson or guava. 
+ Ref: https://maven.apache.org/plugins/maven-shade-plugin/examples/class-relocation.html + --> + <relocations> + <relocation> + <pattern>com.google.common.base</pattern> + <shadedPattern>shaded.com.google.common.base</shadedPattern> + </relocation> + <relocation> + <pattern>com.fasterxml.jackson</pattern> + <shadedPattern>shaded.com.fasterxml.jackson</shadedPattern> + </relocation> + </relocations> + <transformers> + <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/> + <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> + <mainClass>org.apache.pinot.spark.PinotSparkJobLauncher</mainClass> + </transformer> + </transformers> + </configuration> + </execution> + </executions> + </plugin> + </plugins> + </build> + </profile> + </profiles> + <dependencies> + <dependency> + <groupId>org.apache.pinot</groupId> + <artifactId>pinot-core</artifactId> + <exclusions> + <exclusion> + <groupId>commons-lang</groupId> + <artifactId>commons-lang</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.pinot</groupId> + <artifactId>pinot-common</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.apache.pinot</groupId> + <artifactId>pinot-common</artifactId> + <version>${project.version}</version> + <classifier>shaded</classifier> + </dependency> + <dependency> + <groupId>org.apache.pinot</groupId> + <artifactId>pinot-parquet</artifactId> + <version>${project.version}</version> + <exclusions> + <exclusion> + <groupId>org.apache.pinot</groupId> + <artifactId>pinot-core</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.apache.pinot</groupId> + <artifactId>pinot-orc</artifactId> + <version>${project.version}</version> + <exclusions> + <exclusion> + <groupId>org.apache.pinot</groupId> + <artifactId>pinot-core</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + 
<groupId>org.apache.spark</groupId> + <artifactId>spark-core_${scala.binary.version}</artifactId> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-sql_${scala.binary.version}</artifactId> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>commons-logging</groupId> + <artifactId>commons-logging</artifactId> + </dependency> + <dependency> + <groupId>commons-lang</groupId> + <artifactId>commons-lang</artifactId> + </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-math3</artifactId> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.apache.avro</groupId> + <artifactId>avro-mapred</artifactId> + <classifier>hadoop2</classifier> + </dependency> + <dependency> + <groupId>org.testng</groupId> + <artifactId>testng</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.mockito</groupId> + <artifactId>mockito-core</artifactId> + <scope>test</scope> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-jar-plugin</artifactId> + <version>2.5</version> + <configuration> + <forceCreation>true</forceCreation> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-shade-plugin</artifactId> + <version>2.3</version> + <configuration> + <createDependencyReducedPom>false</createDependencyReducedPom> + <filters> + <filter> + <artifact>*:*</artifact> + <excludes> + <exclude>META-INF/*.SF</exclude> + <exclude>META-INF/*.DSA</exclude> + <exclude>META-INF/*.RSA</exclude> + </excludes> + </filter> + </filters> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-enforcer-plugin</artifactId> + </plugin> + </plugins> + </build> +</project> diff --git a/pom.xml b/pom.xml index 38bee33..e6dc7f1 100644 --- a/pom.xml +++ b/pom.xml @@ -45,6 
+45,7 @@ <module>pinot-core</module> <module>pinot-controller</module> <module>pinot-hadoop</module> + <module>pinot-spark</module> <module>pinot-tools</module> <module>pinot-perf</module> <module>pinot-transport</module> @@ -125,6 +126,8 @@ <jersey.version>2.28</jersey.version> <swagger.version>1.5.16</swagger.version> <hadoop.version>2.7.0</hadoop.version> + <spark.version>2.2.0</spark.version> + <scala.binary.version>2.11</scala.binary.version> <antlr.version>4.6</antlr.version> <calcite.version>1.19.0</calcite.version> <!-- commons-configuration, hadoop-common, hadoop-client use commons-lang --> @@ -253,6 +256,11 @@ </dependency> <dependency> <groupId>org.apache.pinot</groupId> + <artifactId>pinot-spark</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>org.apache.pinot</groupId> <artifactId>pinot-server</artifactId> <version>${project.version}</version> </dependency> @@ -578,6 +586,18 @@ <version>${jackson.version}</version> </dependency> + <!-- Spark --> + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-core_${scala.binary.version}</artifactId> + <version>${spark.version}</version> + </dependency> + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-sql_${scala.binary.version}</artifactId> + <version>${spark.version}</version> + </dependency> + <!-- Hadoop --> <dependency> <groupId>org.apache.hadoop</groupId> --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
