This is an automated email from the ASF dual-hosted git repository.

xiangfu pushed a commit to branch pinot-spark
in repository https://gitbox.apache.org/repos/asf/incubator-pinot.git

commit 996bcc5c28034dc40de766428249d5f30c26d331
Author: Xiang Fu <[email protected]>
AuthorDate: Tue Nov 5 17:08:08 2019 -0800

    initial commit for pinot spark
---
 pinot-spark/README.md |  78 +++++++++++++++++++
 pinot-spark/pom.xml   | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++
 pom.xml               |  20 +++++
 3 files changed, 300 insertions(+)

diff --git a/pinot-spark/README.md b/pinot-spark/README.md
new file mode 100644
index 0000000..069cada
--- /dev/null
+++ b/pinot-spark/README.md
@@ -0,0 +1,78 @@
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+-->
+# Pinot Spark
+
+Introduction
+------------
+
+Pinot supports data segment generation from Spark.
+
+
+Build
+-----
+
+To build the project:
+
+```
+mvn clean install -DskipTests
+```
+
+This will create a shaded (fat) JAR for the pinot-spark module.
+
+Run
+---
+
+Create a job properties configuration file, e.g.:
+
+```
+# Segment creation job configs:
+path.to.input=pinot/input/data
+path.to.output=pinot/output
+path.to.schema=pinot/input/schema/data.schema
+segment.table.name=testTable
+
+# Segment tar push job configs:
+push.to.hosts=controller_host_0,controller_host_1
+push.to.port=8888
+```
+
+The Pinot data schema file needs to be checked in locally, and its path
+specified in the job properties file.
+
+The `org.apache.pinot.spark.PinotSparkJobLauncher` class (the main class of 
the shaded JAR in `pinot-spark`) should be run to accomplish this:
+
+```
+# Segment creation
+    spark jar  pinot-spark-1.0-SNAPSHOT.jar SegmentCreation job.properties
+  
+After this point, we have built the data segment from the raw data file.
+Next step is to push those data into pinot controller
+
+# Segment tar push
+    spark jar  pinot-spark-1.0-SNAPSHOT.jar SegmentTarPush job.properties
+
+There is also a job that combines the two jobs together.
+
+# Segment creation and tar push
+    spark jar  pinot-spark-1.0-SNAPSHOT.jar SegmentCreationAndTarPush 
job.properties
+```
+
+
+
diff --git a/pinot-spark/pom.xml b/pinot-spark/pom.xml
new file mode 100644
index 0000000..615b21a
--- /dev/null
+++ b/pinot-spark/pom.xml
@@ -0,0 +1,202 @@
+<?xml version="1.0"?>
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <artifactId>pinot</artifactId>
+    <groupId>org.apache.pinot</groupId>
+    <version>0.2.0-SNAPSHOT</version>
+  </parent>
+  <artifactId>pinot-spark</artifactId>
+  <name>Pinot Spark</name>
+  <url>https://pinot.apache.org/</url>
+  <properties>
+    <pinot.root>${basedir}/..</pinot.root>
+  </properties>
+  <profiles>
+    <profile>
+      <id>build-shaded-jar</id>
+      <activation>
+        <activeByDefault>false</activeByDefault>
+      </activation>
+      <build>
+        <plugins>
+          <plugin>
+            <artifactId>maven-shade-plugin</artifactId>
+            <version>3.2.1</version>
+            <executions>
+              <execution>
+                <phase>package</phase>
+                <goals>
+                  <goal>shade</goal>
+                </goals>
+                <configuration>
+                  <!--
+                  Usually in hadoop environment, there are multiple jars with 
different versions.
+                  Most of the NoSuchMethodExceptions are caused by class 
loading conflicts.
+                  Class relocation ensures the reference of certain 
packages/classes in Pinot code to
+                  shaded libs, e.g. jackson or guava.
+                  Ref: 
https://maven.apache.org/plugins/maven-shade-plugin/examples/class-relocation.html
+                  -->
+                  <relocations>
+                    <relocation>
+                      <pattern>com.google.common.base</pattern>
+                      
<shadedPattern>shaded.com.google.common.base</shadedPattern>
+                    </relocation>
+                    <relocation>
+                      <pattern>com.fasterxml.jackson</pattern>
+                      
<shadedPattern>shaded.com.fasterxml.jackson</shadedPattern>
+                    </relocation>
+                  </relocations>
+                  <transformers>
+                    <transformer 
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
+                    <transformer 
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                      <mainClass>org.apache.pinot.spark.PinotSparkJobLauncher</mainClass>
+                    </transformer>
+                  </transformers>
+                </configuration>
+              </execution>
+            </executions>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
+  </profiles>
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.pinot</groupId>
+      <artifactId>pinot-core</artifactId>
+      <exclusions>
+        <exclusion>
+          <groupId>commons-lang</groupId>
+          <artifactId>commons-lang</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.pinot</groupId>
+          <artifactId>pinot-common</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.pinot</groupId>
+      <artifactId>pinot-common</artifactId>
+      <version>${project.version}</version>
+      <classifier>shaded</classifier>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.pinot</groupId>
+      <artifactId>pinot-parquet</artifactId>
+      <version>${project.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>org.apache.pinot</groupId>
+          <artifactId>pinot-core</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.pinot</groupId>
+      <artifactId>pinot-orc</artifactId>
+      <version>${project.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>org.apache.pinot</groupId>
+          <artifactId>pinot-core</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-sql_${scala.binary.version}</artifactId>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>commons-logging</groupId>
+      <artifactId>commons-logging</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>commons-lang</groupId>
+      <artifactId>commons-lang</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-math3</artifactId>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.avro</groupId>
+      <artifactId>avro-mapred</artifactId>
+      <classifier>hadoop2</classifier>
+    </dependency>
+    <dependency>
+      <groupId>org.testng</groupId>
+      <artifactId>testng</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-core</artifactId>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+        <version>2.5</version>
+        <configuration>
+          <forceCreation>true</forceCreation>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-shade-plugin</artifactId>
+        <version>2.3</version>
+        <configuration>
+          <createDependencyReducedPom>false</createDependencyReducedPom>
+          <filters>
+            <filter>
+              <artifact>*:*</artifact>
+              <excludes>
+                <exclude>META-INF/*.SF</exclude>
+                <exclude>META-INF/*.DSA</exclude>
+                <exclude>META-INF/*.RSA</exclude>
+              </excludes>
+            </filter>
+          </filters>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-enforcer-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+</project>
diff --git a/pom.xml b/pom.xml
index 38bee33..e6dc7f1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -45,6 +45,7 @@
     <module>pinot-core</module>
     <module>pinot-controller</module>
     <module>pinot-hadoop</module>
+    <module>pinot-spark</module>
     <module>pinot-tools</module>
     <module>pinot-perf</module>
     <module>pinot-transport</module>
@@ -125,6 +126,8 @@
     <jersey.version>2.28</jersey.version>
     <swagger.version>1.5.16</swagger.version>
     <hadoop.version>2.7.0</hadoop.version>
+    <spark.version>2.2.0</spark.version>
+    <scala.binary.version>2.11</scala.binary.version>
     <antlr.version>4.6</antlr.version>
     <calcite.version>1.19.0</calcite.version>
     <!-- commons-configuration, hadoop-common, hadoop-client use commons-lang 
-->
@@ -253,6 +256,11 @@
       </dependency>
       <dependency>
         <groupId>org.apache.pinot</groupId>
+        <artifactId>pinot-spark</artifactId>
+        <version>${project.version}</version>
+      </dependency>
+      <dependency>
+        <groupId>org.apache.pinot</groupId>
         <artifactId>pinot-server</artifactId>
         <version>${project.version}</version>
       </dependency>
@@ -578,6 +586,18 @@
         <version>${jackson.version}</version>
       </dependency>
 
+      <!-- Spark  -->
+      <dependency>
+        <groupId>org.apache.spark</groupId>
+        <artifactId>spark-core_${scala.binary.version}</artifactId>
+        <version>${spark.version}</version>
+      </dependency>
+      <dependency>
+        <groupId>org.apache.spark</groupId>
+        <artifactId>spark-sql_${scala.binary.version}</artifactId>
+        <version>${spark.version}</version>
+      </dependency>
+
       <!-- Hadoop  -->
       <dependency>
         <groupId>org.apache.hadoop</groupId>


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to