This is an automated email from the ASF dual-hosted git repository. kenn pushed a commit to branch asf-site in repository https://gitbox.apache.org/repos/asf/beam-site.git
commit fa0eec805f728fb194569715aa67d6d715605496 Author: xiliu <[email protected]> AuthorDate: Mon Jun 4 10:28:09 2018 -0700 [BEAM-3079]: Samza Runner docs and capability matrix --- src/_data/capability-matrix.yml | 118 ++++++++++++++++++++++++++- src/documentation/index.md | 1 + src/documentation/runners/samza.md | 149 +++++++++++++++++++++++++++++++++++ src/get-started/beam-overview.md | 1 + src/get-started/quickstart-java.md | 23 ++++++ src/get-started/wordcount-example.md | 53 +++++++++++++ src/images/logo_samza.png | Bin 0 -> 11728 bytes src/images/logos/runners/samza.png | Bin 0 -> 11728 bytes src/index.md | 3 + 9 files changed, 346 insertions(+), 2 deletions(-) diff --git a/src/_data/capability-matrix.yml b/src/_data/capability-matrix.yml index acac0ad..508ac1f 100644 --- a/src/_data/capability-matrix.yml +++ b/src/_data/capability-matrix.yml @@ -17,6 +17,8 @@ columns: name: JStorm - class: ibmstreams name: IBM Streams + - class: samza + name: Apache Samza categories: - description: What is being computed? @@ -64,6 +66,10 @@ categories: l1: 'Yes' l2: fully supported l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: Supported with per-element transformation. - name: GroupByKey values: - class: model @@ -102,6 +108,10 @@ categories: l1: 'Yes' l2: fully supported l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: "Uses Samza's partitionBy for key grouping and Beam's logic for window aggregation and triggering." - name: Flatten values: - class: model @@ -140,6 +150,10 @@ categories: l1: 'Yes' l2: fully supported l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: '' - name: Combine values: - class: model @@ -178,6 +192,10 @@ categories: l1: 'Yes' l2: fully supported l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: Use combiner for efficient pre-aggregation. 
- name: Composite Transforms values: - class: model @@ -216,6 +234,10 @@ categories: l1: 'Partially' l2: supported via inlining l3: '' + - class: samza + l1: 'Partially' + l2: supported via inlining + l3: '' - name: Side Inputs values: - class: model @@ -254,6 +276,10 @@ categories: l1: 'Yes' l2: fully supported l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: Uses Samza's broadcast operator to distribute the side inputs. - name: Source API values: - class: model @@ -292,6 +318,10 @@ categories: l1: 'Yes' l2: fully supported l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: '' - name: Splittable DoFn values: - class: model @@ -329,7 +359,11 @@ categories: - class: ibmstreams l1: 'No' l2: not implemented - l3: + l3: + - class: samza + l1: 'No' + l2: not implemented + l3: - name: Metrics values: - class: model @@ -368,6 +402,10 @@ categories: l1: 'Partially' l2: All metrics types are supported. l3: Only attempted values are supported. No committed values for metrics. + - class: samza + l1: 'Partially' + l2: Counter and Gauge are supported. + l3: Only attempted values are supported. No committed values for metrics. - name: Stateful Processing values: - class: model @@ -406,6 +444,10 @@ categories: l1: 'Partially' l2: non-merging windows l3: '' + - class: samza + l1: 'Partially' + l2: non-merging windows + l3: 'States are backed up by either rocksDb KV store or in-memory hash map, and persist using changelog.' - description: Where in event time? 
anchor: where color-b: '37d' @@ -451,6 +493,10 @@ categories: l1: 'Yes' l2: supported l3: '' + - class: samza + l1: 'Yes' + l2: supported + l3: '' - name: Fixed windows values: - class: model @@ -489,6 +535,10 @@ categories: l1: 'Yes' l2: supported l3: '' + - class: samza + l1: 'Yes' + l2: supported + l3: '' - name: Sliding windows values: - class: model @@ -527,6 +577,10 @@ categories: l1: 'Yes' l2: supported l3: '' + - class: samza + l1: 'Yes' + l2: supported + l3: '' - name: Session windows values: - class: model @@ -565,6 +619,10 @@ categories: l1: 'Yes' l2: supported l3: '' + - class: samza + l1: 'Yes' + l2: supported + l3: '' - name: Custom windows values: - class: model @@ -603,6 +661,10 @@ categories: l1: 'Yes' l2: supported l3: '' + - class: samza + l1: 'Yes' + l2: supported + l3: '' - name: Custom merging windows values: - class: model @@ -641,6 +703,10 @@ categories: l1: 'Yes' l2: supported l3: '' + - class: samza + l1: 'Yes' + l2: supported + l3: '' - name: Timestamp control values: - class: model @@ -679,6 +745,10 @@ categories: l1: 'Yes' l2: supported l3: '' + - class: samza + l1: 'Yes' + l2: supported + l3: '' - description: When in processing time? 
anchor: when @@ -726,6 +796,10 @@ categories: l1: 'Yes' l2: fully supported l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: '' - name: Event-time triggers values: @@ -765,6 +839,10 @@ categories: l1: 'Yes' l2: fully supported l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: '' - name: Processing-time triggers values: @@ -804,6 +882,10 @@ categories: l1: 'Yes' l2: fully supported l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: '' - name: Count triggers values: @@ -843,6 +925,10 @@ categories: l1: 'Yes' l2: fully supported l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: '' - name: '[Meta]data driven triggers' values: @@ -882,7 +968,11 @@ categories: - class: ibmstreams l1: 'No' l2: pending model support - l3: + l3: + - class: samza + l1: 'No' + l2: pending model support + l3: - name: Composite triggers values: @@ -922,6 +1012,10 @@ categories: l1: 'Yes' l2: fully supported l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: '' - name: Allowed lateness values: @@ -961,6 +1055,10 @@ categories: l1: 'Yes' l2: fully supported l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: '' - name: Timers values: @@ -1000,6 +1098,10 @@ categories: l1: 'Partially' l2: non-merging windows l3: '' + - class: samza + l1: 'No' + l2: '' + l3: '' - description: How do refinements relate? 
anchor: how @@ -1047,6 +1149,10 @@ categories: l1: 'Yes' l2: fully supported l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: '' - name: Accumulating values: @@ -1086,6 +1192,10 @@ categories: l1: 'Yes' l2: fully supported l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: '' - name: 'Accumulating & Retracting' values: @@ -1126,3 +1236,7 @@ categories: l1: 'No' l2: pending model support l3: '' + - class: samza + l1: 'No' + l2: pending model support + l3: '' \ No newline at end of file diff --git a/src/documentation/index.md b/src/documentation/index.md index e7b36cc..8c008f1 100644 --- a/src/documentation/index.md +++ b/src/documentation/index.md @@ -46,6 +46,7 @@ A Beam Runner runs a Beam pipeline on a specific (often distributed) data proces * [SparkRunner]({{ site.baseurl }}/documentation/runners/spark/): Runs on [Apache Spark](http://spark.apache.org). * [DataflowRunner]({{ site.baseurl }}/documentation/runners/dataflow/): Runs on [Google Cloud Dataflow](https://cloud.google.com/dataflow), a fully managed service within [Google Cloud Platform](https://cloud.google.com/). * [GearpumpRunner]({{ site.baseurl }}/documentation/runners/gearpump/): Runs on [Apache Gearpump (incubating)](http://gearpump.apache.org). +* [SamzaRunner]({{ site.baseurl }}/documentation/runners/samza/): Runs on [Apache Samza](http://samza.apache.org). ### Choosing a Runner diff --git a/src/documentation/runners/samza.md b/src/documentation/runners/samza.md new file mode 100644 index 0000000..fe48576 --- /dev/null +++ b/src/documentation/runners/samza.md @@ -0,0 +1,149 @@ +--- +layout: section +title: "Apache Samza Runner" +section_menu: section-menu/runners.html +permalink: /documentation/runners/samza/ +redirect_from: /learn/runners/Samza/ +--- +# Using the Apache Samza Runner + +The Apache Samza Runner can be used to execute Beam pipelines using [Apache Samza](http://samza.apache.org/). 
The Samza Runner executes a Beam pipeline in a Samza application and can run locally. The application can further be built into a .tgz file, and deployed to a YARN cluster or Samza standalone cluster with Zookeeper. + +The Samza Runner and Samza are suitable for large scale, stateful streaming jobs, and provide: + +* First class support for local state (with RocksDB store). This allows fast state access for high frequency streaming jobs. +* Fault-tolerance with support for incremental checkpointing of state instead of full snapshots. This enables Samza to scale to applications with very large state. +* A fully asynchronous processing engine that makes remote calls efficient. +* Flexible deployment model for running the applications in any hosting environment with Zookeeper. +* Features like canaries, upgrades and rollbacks that support extremely large deployments with minimal downtime. + +The [Beam Capability Matrix]({{ site.baseurl }}/documentation/runners/capability-matrix/) documents the currently supported capabilities of the Samza Runner. + +## Samza Runner prerequisites and setup + +The Samza Runner is built on a Samza version greater than 0.14.1, and uses Scala version 2.11. 
+ +### Specify your dependency + +<span class="language-java">You can specify your dependency on the Samza Runner by adding the following to your `pom.xml`:</span> +```java +<dependency> + <groupId>org.apache.beam</groupId> + <artifactId>beam-runners-samza_2.11</artifactId> + <version>{{ site.release_latest }}</version> + <scope>runtime</scope> +</dependency> + +<!-- Samza dependencies --> +<dependency> + <groupId>org.apache.samza</groupId> + <artifactId>samza-api</artifactId> + <version>${samza.version}</version> +</dependency> + +<dependency> + <groupId>org.apache.samza</groupId> + <artifactId>samza-core_2.11</artifactId> + <version>${samza.version}</version> +</dependency> + +<dependency> + <groupId>org.apache.samza</groupId> + <artifactId>samza-kafka_2.11</artifactId> + <version>${samza.version}</version> + <scope>runtime</scope> +</dependency> + +<dependency> + <groupId>org.apache.samza</groupId> + <artifactId>samza-kv_2.11</artifactId> + <version>${samza.version}</version> + <scope>runtime</scope> +</dependency> + +<dependency> + <groupId>org.apache.samza</groupId> + <artifactId>samza-kv-rocksdb_2.11</artifactId> + <version>${samza.version}</version> + <scope>runtime</scope> +</dependency> + +``` + +## Executing a pipeline with Samza Runner + +If you run your pipeline locally or deploy it to a standalone cluster bundled with all the jars and resource files, no packaging is required. For example, the following command runs the WordCount example: + +``` +$ mvn exec:java -Dexec.mainClass=org.apache.beam.examples.WordCount \ + -Psamza-runner \ + -Dexec.args="--runner=SamzaRunner \ + --inputFile=/path/to/input \ + --output=/path/to/counts" +``` + +To deploy your pipeline to a YARN cluster, you need to package your application jars and resource files into a `.tgz` archive file. 
In your config, you need to specify the URI of the TGZ file for the Samza Runner to download: + +``` +yarn.package.path=${your_job_tgz_URI} + +job.name=${your_job_name} +job.factory.class=org.apache.samza.job.yarn.YarnJobFactory +job.coordinator.system=${job_coordinator_system} +job.default.system=${job_default_system} +``` + +The config file can be passed to the Samza Runner by setting the command line arg `--configFilePath=/path/to/config.properties`. For more details on the Samza configuration, see [Samza Configuration Reference](https://samza.apache.org/learn/documentation/latest/jobs/configuration-table.html). + +## Pipeline options for the Samza Runner + +When executing your pipeline with the Samza Runner, you can use the following pipeline options. + +<table class="table table-bordered"> +<tr> + <th>Field</th> + <th>Description</th> + <th>Default Value</th> +</tr> +<tr> + <td><code>runner</code></td> + <td>The pipeline runner to use. This option allows you to determine the pipeline runner at runtime.</td> + <td>Set to <code>SamzaRunner</code> to run using Samza.</td> +</tr> +<tr> + <td><code>samzaConfig</code></td> + <td>The config for Samza runner.</td> + <td>Config for running locally.</td> +</tr> +<tr> + <td><code>configFilePath</code></td> + <td>The config for Samza runner using a properties file.</td> + <td><code>empty</code>, i.e. 
use default samzaConfig</td> +</tr> +<tr> + <td><code>watermarkInterval</code></td> + <td>The interval to check for watermarks in milliseconds.</td> + <td><code>1000</code></td> +</tr> +<tr> + <td><code>systemBufferSize</code></td> + <td>The maximum number of messages to buffer for a given system.</td> + <td><code>5000</code></td> +</tr> +<tr> + <td><code>maxSourceParallelism</code></td> + <td>The maximum parallelism allowed for any data source.</td> + <td><code>1</code></td> +</tr> +<tr> + <td><code>storeBatchGetSize</code></td> + <td>The batch get size limit for the state store.</td> + <td><code>10000</code></td> +</tr> +</table> + +## Monitoring your job + +You can monitor your pipeline job using metrics emitted from both Beam and Samza, e.g. Beam source metrics such as `elements_read` and `backlog_elements`, and Samza job metrics such as `job-healthy` and `process-envelopes`. A complete list of Samza metrics is in [Samza Metrics Reference](https://samza.apache.org/learn/documentation/latest/container/metrics-table.html). You can view your job's metrics via JMX in development, and send the metrics to graphing system such as [Graphite](http: [...] + +For a running Samza YARN job, you can use YARN web UI to monitor the job status and check logs. \ No newline at end of file diff --git a/src/get-started/beam-overview.md b/src/get-started/beam-overview.md index 1b2fbfc..68b16de 100644 --- a/src/get-started/beam-overview.md +++ b/src/get-started/beam-overview.md @@ -37,6 +37,7 @@ Beam currently supports Runners that work with the following distributed process * Apache Gearpump (incubating)  * Apache Spark  * Google Cloud Dataflow  +* Apache Samza  **Note:** You can always execute your pipeline locally for testing and debugging purposes. 
diff --git a/src/get-started/quickstart-java.md b/src/get-started/quickstart-java.md index c80576f..0775ab8 100644 --- a/src/get-started/quickstart-java.md +++ b/src/get-started/quickstart-java.md @@ -113,6 +113,11 @@ $ mvn compile exec:java -Dexec.mainClass=org.apache.beam.examples.WordCount \ -Pdataflow-runner ``` +{:.runner-samza-local} +``` +$ mvn compile exec:java -Dexec.mainClass=org.apache.beam.examples.WordCount \ + -Dexec.args="--inputFile=pom.xml --output=/tmp/counts --runner=SamzaRunner" -Psamza-runner +``` ## Inspect the results @@ -148,6 +153,11 @@ $ ls counts* $ gsutil ls gs://<your-gcs-bucket>/counts* ``` +{:.runner-samza-local} +``` +$ ls /tmp/counts* +``` + When you look into the contents of the file, you'll see that they contain unique words and the number of occurrences of each word. The order of elements within the file may differ because the Beam model does not generally guarantee ordering, again to allow runners to optimize for efficiency. {:.runner-direct} @@ -228,6 +238,19 @@ barrenly: 1 ... ``` +{:.runner-samza-local} +``` +$ more /tmp/counts* +api: 7 +are: 2 +can: 2 +com: 14 +end: 14 +for: 14 +has: 2 +... 
+``` + ## Next Steps * Learn more about the [Beam SDK for Java]({{ site.baseurl }}/documentation/sdks/java/) diff --git a/src/get-started/wordcount-example.md b/src/get-started/wordcount-example.md index 41f21f5..6a7025b 100644 --- a/src/get-started/wordcount-example.md +++ b/src/get-started/wordcount-example.md @@ -365,6 +365,12 @@ $ mvn compile exec:java -Dexec.mainClass=org.apache.beam.examples.WordCount \ -Pdataflow-runner ``` +{:.runner-samza-local} +``` +$ mvn compile exec:java -Dexec.mainClass=org.apache.beam.examples.WordCount \ + -Dexec.args="--inputFile=pom.xml --output=counts --runner=SamzaRunner" -Psamza-runner +``` + To view the full code in Java, see **[WordCount](https://github.com/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/WordCount.java).** @@ -406,6 +412,11 @@ python -m apache_beam.examples.wordcount --input gs://dataflow-samples/shakespea --temp_location gs://YOUR_GCS_BUCKET/tmp/ ``` +{:.runner-samza-local} +``` +This runner is not yet available for the Python SDK. +``` + To view the full code in Python, see **[wordcount.py](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/examples/wordcount.py).** @@ -448,6 +459,11 @@ $ wordcount --input gs://dataflow-samples/shakespeare/kinglear.txt \ --worker_harness_container_image=apache-docker-beam-snapshots-docker.bintray.io/beam/go:20180515 ``` +{:.runner-samza-local} +``` +This runner is not yet available for the Go SDK. 
+``` + To view the full code in Go, see **[wordcount.go](https://github.com/apache/beam/blob/master/sdks/go/examples/wordcount/wordcount.go).** @@ -676,6 +692,12 @@ $ mvn compile exec:java -Dexec.mainClass=org.apache.beam.examples.DebuggingWordC -Pdataflow-runner ``` +{:.runner-samza-local} +``` +$ mvn compile exec:java -Dexec.mainClass=org.apache.beam.examples.DebuggingWordCount \ + -Dexec.args="--runner=SamzaRunner --output=counts" -Psamza-runner +``` + To view the full code in Java, see [DebuggingWordCount](https://github.com/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/DebuggingWordCount.java). @@ -717,6 +739,11 @@ python -m apache_beam.examples.wordcount_debugging --input gs://dataflow-samples --temp_location gs://YOUR_GCS_BUCKET/tmp/ ``` +{:.runner-samza-local} +``` +This runner is not yet available for the Python SDK. +``` + To view the full code in Python, see **[wordcount_debugging.py](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/examples/wordcount_debugging.py).** @@ -759,6 +786,11 @@ $ debugging_wordcount --input gs://dataflow-samples/shakespeare/kinglear.txt \ --worker_harness_container_image=apache-docker-beam-snapshots-docker.bintray.io/beam/go:20180515 ``` +{:.runner-samza-local} +``` +This runner is not yet available for the Go SDK. 
+``` + To view the full code in Go, see **[debugging_wordcount.go](https://github.com/apache/beam/blob/master/sdks/go/examples/debugging_wordcount/debugging_wordcount.go).** @@ -981,6 +1013,12 @@ $ mvn compile exec:java -Dexec.mainClass=org.apache.beam.examples.WindowedWordCo -Pdataflow-runner ``` +{:.runner-samza-local} +``` +$ mvn compile exec:java -Dexec.mainClass=org.apache.beam.examples.WindowedWordCount \ + -Dexec.args="--runner=SamzaRunner --inputFile=pom.xml --output=counts" -Psamza-runner +``` + To view the full code in Java, see **[WindowedWordCount](https://github.com/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/WindowedWordCount.java).** @@ -1026,6 +1064,11 @@ python -m apache_beam.examples.windowed_wordcount --input YOUR_INPUT_FILE \ --temp_location gs://YOUR_GCS_BUCKET/tmp/ ``` +{:.runner-samza-local} +``` +This runner is not yet available for the Python SDK. +``` + To view the full code in Python, see **[windowed_wordcount.py](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/examples/windowed_wordcount.py).** @@ -1068,6 +1111,11 @@ $ windowed_wordcount --input gs://dataflow-samples/shakespeare/kinglear.txt \ --worker_harness_container_image=apache-docker-beam-snapshots-docker.bintray.io/beam/go:20180515 ``` +{:.runner-samza-local} +``` +This runner is not yet available for the Go SDK. +``` + To view the full code in Go, see **[windowed_wordcount.go](https://github.com/apache/beam/blob/master/sdks/go/examples/windowed_wordcount/windowed_wordcount.go).** @@ -1294,6 +1342,11 @@ python -m apache_beam.examples.streaming_wordcount \ --streaming ``` +{:.runner-samza-local} +``` +This runner is not yet available for the Python SDK. 
+``` + To view the full code in Python, see **[streaming_wordcount.py](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/examples/streaming_wordcount.py).** diff --git a/src/images/logo_samza.png b/src/images/logo_samza.png new file mode 100644 index 0000000..88e5ba3 Binary files /dev/null and b/src/images/logo_samza.png differ diff --git a/src/images/logos/runners/samza.png b/src/images/logos/runners/samza.png new file mode 100644 index 0000000..88e5ba3 Binary files /dev/null and b/src/images/logos/runners/samza.png differ diff --git a/src/index.md b/src/index.md index bb5eaec..a71d27e 100644 --- a/src/index.md +++ b/src/index.md @@ -18,6 +18,9 @@ logos: - title: Gearpump image_url: /images/logo_gearpump.png url: http://gearpump.apache.org/ +- title: Samza + image_url: /images/logo_samza.png + url: http://samza.apache.org/ pillars: - title: Unified
