Repository: beam
Updated Branches:
  refs/heads/master 8b2e8f295 -> d7ed2e23a
[BEAM-1320] Fix documentation in the sdk and fail for new errors

Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/228b18e8
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/228b18e8
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/228b18e8

Branch: refs/heads/master
Commit: 228b18e851e535fecfa3ac5086afd299f91dd1c3
Parents: 8b2e8f2
Author: Sourabh Bajaj <[email protected]>
Authored: Wed Feb 22 02:34:54 2017 -0800
Committer: Ahmet Altay <[email protected]>
Committed: Wed Feb 22 07:55:44 2017 -0800

----------------------------------------------------------------------
 .../examples/complete/top_wikipedia_sessions.py | 10 ++--
 .../examples/cookbook/bigquery_tornadoes.py     |  5 +-
 .../examples/cookbook/multiple_output_pardo.py  | 11 +++--
 .../apache_beam/examples/snippets/snippets.py   |  6 +++
 .../io/google_cloud_platform/bigquery.py        |  4 +-
 sdks/python/apache_beam/io/iobase.py            |  2 +
 sdks/python/apache_beam/io/source_test_utils.py | 39 +++++++--------
 sdks/python/apache_beam/io/textio.py            |  3 +-
 sdks/python/apache_beam/metrics/execution.py    |  1 +
 sdks/python/apache_beam/metrics/metricbase.py   |  3 +-
 .../runners/direct/bundle_factory.py            |  6 ++-
 sdks/python/apache_beam/transforms/core.py      |  4 +-
 sdks/python/apache_beam/transforms/display.py   |  1 +
 sdks/python/apache_beam/utils/annotations.py    | 50 +++++++++++---------
 sdks/python/apache_beam/utils/profiler.py       |  6 ++-
 sdks/python/generate_pydoc.sh                   | 10 +++-
 16 files changed, 96 insertions(+), 65 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/beam/blob/228b18e8/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions.py
----------------------------------------------------------------------
diff --git a/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions.py b/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions.py
index d7fbe30..e6cab18 100644
--- a/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions.py
+++ b/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions.py
@@ -22,19 +22,21 @@ user with the longest string of edits separated by no more than an hour within
 each 30 day period.
 
 To execute this pipeline locally using the DirectRunner, specify an
-output prefix on GCS:
+output prefix on GCS::
+
   --output gs://YOUR_OUTPUT_PREFIX
 
 To execute this pipeline using the Google Cloud Dataflow service, specify
-pipeline configuration in addition to the above:
+pipeline configuration in addition to the above::
+
   --job_name NAME_FOR_YOUR_JOB
   --project YOUR_PROJECT_ID
   --staging_location gs://YOUR_STAGING_DIRECTORY
   --temp_location gs://YOUR_TEMPORARY_DIRECTORY
   --runner DataflowRunner
 
-The default input is gs://dataflow-samples/wikipedia_edits/*.json and can be
-overridden with --input.
+The default input is ``gs://dataflow-samples/wikipedia_edits/*.json`` and can
+be overridden with --input.
 """
 
 from __future__ import absolute_import
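Aside: the recurring change in this patch replaces a single colon with the
reST literal-block marker ``::``, so Sphinx renders the indented block
verbatim instead of reflowing it as prose (and stops warning about unexpected
indentation). A minimal docstring illustrating the convention; the function
name here is illustrative, not from the patch:

    def run_example():
      """Runs the example pipeline.

      To execute this pipeline locally, specify an output prefix on GCS::

        --output gs://YOUR_OUTPUT_PREFIX

      The double colon above makes Sphinx render the flag verbatim; a single
      colon would reflow it as ordinary text.
      """
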
http://git-wip-us.apache.org/repos/asf/beam/blob/228b18e8/sdks/python/apache_beam/examples/cookbook/bigquery_tornadoes.py
----------------------------------------------------------------------
diff --git a/sdks/python/apache_beam/examples/cookbook/bigquery_tornadoes.py b/sdks/python/apache_beam/examples/cookbook/bigquery_tornadoes.py
index 1eade9d..3857111 100644
--- a/sdks/python/apache_beam/examples/cookbook/bigquery_tornadoes.py
+++ b/sdks/python/apache_beam/examples/cookbook/bigquery_tornadoes.py
@@ -24,8 +24,9 @@ is a boolean field.
 
 The workflow will compute the number of tornadoes in each month and output
 the results to a table (created if needed) with the following schema:
-  - month: number
-  - tornado_count: number
+
+- month: number
+- tornado_count: number
 
 This example uses the default behavior for BigQuery source and sinks that
 represents table rows as plain Python dictionaries.


http://git-wip-us.apache.org/repos/asf/beam/blob/228b18e8/sdks/python/apache_beam/examples/cookbook/multiple_output_pardo.py
----------------------------------------------------------------------
diff --git a/sdks/python/apache_beam/examples/cookbook/multiple_output_pardo.py b/sdks/python/apache_beam/examples/cookbook/multiple_output_pardo.py
index 26e97c7..978e4ed 100644
--- a/sdks/python/apache_beam/examples/cookbook/multiple_output_pardo.py
+++ b/sdks/python/apache_beam/examples/cookbook/multiple_output_pardo.py
@@ -24,25 +24,28 @@ the corresponding result (a PCollection) for that side output.
 
 This is a slightly modified version of the basic wordcount example. In this
 example words are divided into 2 buckets as shorts words (3 characters in length
-or less) and words (all other words). There will be 3 output files:
+or less) and words (all other words). There will be 3 output files::
 
   [OUTPUT]-chars : Character count for the input.
   [OUTPUT]-short-words : Word count for short words only.
   [OUTPUT]-words : Word count for all other words.
 
 To execute this pipeline locally, specify a local output file or output prefix
-on GCS:
+on GCS::
+
   --output [YOUR_LOCAL_FILE | gs://YOUR_OUTPUT_PREFIX]
 
 To execute this pipeline using the Google Cloud Dataflow service, specify
-pipeline configuration:
+pipeline configuration::
+
   --project YOUR_PROJECT_ID
   --staging_location gs://YOUR_STAGING_DIRECTORY
   --temp_location gs://YOUR_TEMP_DIRECTORY
   --job_name YOUR_JOB_NAME
   --runner DataflowRunner
 
-and an output prefix on GCS:
+and an output prefix on GCS::
+
   --output gs://YOUR_OUTPUT_PREFIX
 """


http://git-wip-us.apache.org/repos/asf/beam/blob/228b18e8/sdks/python/apache_beam/examples/snippets/snippets.py
----------------------------------------------------------------------
diff --git a/sdks/python/apache_beam/examples/snippets/snippets.py b/sdks/python/apache_beam/examples/snippets/snippets.py
index e7f28b0..b5dfe8f 100644
--- a/sdks/python/apache_beam/examples/snippets/snippets.py
+++ b/sdks/python/apache_beam/examples/snippets/snippets.py
@@ -583,6 +583,7 @@ def model_custom_source(count):
   distributed to a large number of end users.
 
   This method runs two pipelines.
+
   (1) A pipeline that uses ``CountingSource`` directly using the
       ``df.Read`` transform.
   (2) A pipeline that uses a custom ``PTransform`` that wraps
@@ -591,6 +592,7 @@ def model_custom_source(count):
   Args:
     count: the size of the counting source to be used in the pipeline
       demonstrated in this method.
+
   """
 
   import apache_beam as beam
@@ -708,6 +710,7 @@ def model_custom_sink(simplekv, KVs, final_table_name_no_ptransform,
   distributed to a large number of end users.
 
   This method runs two pipelines.
+
   (1) A pipeline that uses ``SimpleKVSink`` directly using the
       ``df.Write`` transform.
   (2) A pipeline that uses a custom ``PTransform`` that wraps
@@ -715,10 +718,13 @@ def model_custom_sink(simplekv, KVs, final_table_name_no_ptransform,
   Args:
     simplekv: an object that mocks the key-value storage.
+
     KVs: the set of key-value pairs to be written in the example pipeline.
+
     final_table_name_no_ptransform: the prefix of final set of tables to be
       created by the example pipeline that uses ``SimpleKVSink`` directly.
+
     final_table_name_with_ptransform: the prefix of final set of tables to be
       created by the example pipeline that uses a ``PTransform`` that wraps
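Aside: for readers following the multiple_output_pardo example above, a
minimal sketch of a ParDo with one tagged output in addition to its main
output. This uses ``pvalue.TaggedOutput`` and ``with_outputs`` as spelled in
later Beam releases (the SDK at the time of this commit named the tagged
value type differently); the class name and sample words are illustrative:

    import apache_beam as beam
    from apache_beam import pvalue

    class SplitWords(beam.DoFn):
      def process(self, word):
        if len(word) <= 3:
          # Route short words (3 characters or fewer) to a tagged output.
          yield pvalue.TaggedOutput('short_words', word)
        else:
          yield word  # Everything else goes to the main output.

    with beam.Pipeline() as p:
      results = (
          p
          | beam.Create(['a', 'the', 'elephant'])
          | beam.ParDo(SplitWords()).with_outputs('short_words',
                                                  main='words'))
      short_words = results.short_words  # PCollection: 'a', 'the'
      words = results.words              # PCollection: 'elephant'
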
http://git-wip-us.apache.org/repos/asf/beam/blob/228b18e8/sdks/python/apache_beam/io/google_cloud_platform/bigquery.py
----------------------------------------------------------------------
diff --git a/sdks/python/apache_beam/io/google_cloud_platform/bigquery.py b/sdks/python/apache_beam/io/google_cloud_platform/bigquery.py
index a5d5ab2..56b3e46 100644
--- a/sdks/python/apache_beam/io/google_cloud_platform/bigquery.py
+++ b/sdks/python/apache_beam/io/google_cloud_platform/bigquery.py
@@ -43,7 +43,7 @@ completely every time a ParDo DoFn gets executed. In the example below the
 lambda function implementing the DoFn for the Map transform will get on each
 call *one* row of the main table and *all* rows of the side table. The runner
 may use some caching techniques to share the side inputs between calls in order
-to avoid excessive reading:
+to avoid excessive reading::
 
   main_table = pipeline | 'very_big' >> beam.io.Read(beam.io.BigQuerySource()
   side_table = pipeline | 'not_big' >> beam.io.Read(beam.io.BigQuerySource()
@@ -69,7 +69,7 @@ a query.
 
 Users may provide a query to read from rather than reading all of a BigQuery
 table. If specified, the result obtained by executing the specified query will
-be used as the data of the input transform.
+be used as the data of the input transform.::
 
   query_results = pipeline | beam.io.Read(beam.io.BigQuerySource(
       query='SELECT year, mean_temp FROM samples.weather_stations'))


http://git-wip-us.apache.org/repos/asf/beam/blob/228b18e8/sdks/python/apache_beam/io/iobase.py
----------------------------------------------------------------------
diff --git a/sdks/python/apache_beam/io/iobase.py b/sdks/python/apache_beam/io/iobase.py
index 2905111..bd56466 100644
--- a/sdks/python/apache_beam/io/iobase.py
+++ b/sdks/python/apache_beam/io/iobase.py
@@ -86,6 +86,7 @@ class BoundedSource(HasDisplayData):
   ``RangeTracker``.
 
   A runner will perform reading the source in two steps.
+
   (1) Method ``get_range_tracker()`` will be invoked with start and end
       positions to obtain a ``RangeTracker`` for the range of positions the
       runner intends to read. Source must define a default initial start and end
@@ -137,6 +138,7 @@ class BoundedSource(HasDisplayData):
 
     Framework may invoke ``read()`` method with the RangeTracker object
     returned here to read data from the source.
+
     Args:
       start_position: starting position of the range. If 'None' default start
         position of the source must be used.
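Aside: the two-step read protocol that the ``BoundedSource`` docstring above
describes, as a minimal consumer-side sketch (``my_source`` stands in for any
concrete ``BoundedSource``):

    # Step 1: ask the source for a RangeTracker; (None, None) means the
    # source's default start and end positions, i.e. the full range.
    range_tracker = my_source.get_range_tracker(None, None)

    # Step 2: read the elements that fall within the tracked range.
    for element in my_source.read(range_tracker):
      print(element)
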
http://git-wip-us.apache.org/repos/asf/beam/blob/228b18e8/sdks/python/apache_beam/io/source_test_utils.py
----------------------------------------------------------------------
diff --git a/sdks/python/apache_beam/io/source_test_utils.py b/sdks/python/apache_beam/io/source_test_utils.py
index f5e599f..07de738 100644
--- a/sdks/python/apache_beam/io/source_test_utils.py
+++ b/sdks/python/apache_beam/io/source_test_utils.py
@@ -364,28 +364,29 @@ def assertSplitAtFractionSucceedsAndConsistent(source,
                                                split_fraction):
   """Verifies some consistency properties of dynamic work rebalancing.
 
-  Equivalent to the following pseudocode:
-
-  original_range_tracker = source.getRangeTracker(None, None)
-  original_reader = source.read(original_range_tracker)
-  items_before_split = read N items from original_reader
-  suggested_split_position = original_range_tracker.position_for_fraction(
-    split_fraction)
-  original_stop_position - original_range_tracker.stop_position()
-  split_result = range_tracker.try_split()
-  split_position, split_fraction = split_result
-  primary_range_tracker = source.get_range_tracker(
-    original_range_tracker.start_position(), split_position)
-  residual_range_tracker = source.get_range_tracker(split_position,
-    original_stop_position)
-
-  assert that: items when reading source.read(primary_range_tracker) ==
-    items_before_split + items from continuing to read 'original_reader'
-  assert that: items when reading source.read(original_range_tracker) =
-    items when reading source.read(primary_range_tracker) + items when reading
+  Equivalent to the following pseudocode::
+
+    original_range_tracker = source.getRangeTracker(None, None)
+    original_reader = source.read(original_range_tracker)
+    items_before_split = read N items from original_reader
+    suggested_split_position = original_range_tracker.position_for_fraction(
+      split_fraction)
+    original_stop_position - original_range_tracker.stop_position()
+    split_result = range_tracker.try_split()
+    split_position, split_fraction = split_result
+    primary_range_tracker = source.get_range_tracker(
+      original_range_tracker.start_position(), split_position)
+    residual_range_tracker = source.get_range_tracker(split_position,
+      original_stop_position)
+
+    assert that: items when reading source.read(primary_range_tracker) ==
+      items_before_split + items from continuing to read 'original_reader'
+    assert that: items when reading source.read(original_range_tracker) =
+      items when reading source.read(primary_range_tracker) + items when reading
       source.read(residual_range_tracker)
 
   Args:
+    source: source to perform dynamic work rebalancing on.
     num_items_to_read_before_split: number of items to read before splitting.
     split_fraction: fraction to split at.


http://git-wip-us.apache.org/repos/asf/beam/blob/228b18e8/sdks/python/apache_beam/io/textio.py
----------------------------------------------------------------------
diff --git a/sdks/python/apache_beam/io/textio.py b/sdks/python/apache_beam/io/textio.py
index a06ad4b..2ddaf02 100644
--- a/sdks/python/apache_beam/io/textio.py
+++ b/sdks/python/apache_beam/io/textio.py
@@ -336,7 +336,8 @@ class ReadFromText(PTransform):
   UTF-8 encoding. Supports newline delimiters '\n' and '\r\n'.
 
   This implementation only supports reading text encoded using UTF-8 or ASCII.
-  This does not support other encodings such as UTF-16 or UTF-32."""
+  This does not support other encodings such as UTF-16 or UTF-32.
+  """
 
   def __init__(
       self,
       file_pattern=None,
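Aside: the helper documented above is used roughly like this in a source's
unit tests (``LineSource`` is a hypothetical ``BoundedSource``
implementation; the argument names come straight from the docstring):

    from apache_beam.io import source_test_utils

    source = LineSource('/path/to/test/file')  # hypothetical source
    # Read 10 items, split the remainder at 50%, and verify that the primary
    # and residual ranges together reproduce the original source's output.
    source_test_utils.assertSplitAtFractionSucceedsAndConsistent(
        source, num_items_to_read_before_split=10, split_fraction=0.5)
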
http://git-wip-us.apache.org/repos/asf/beam/blob/228b18e8/sdks/python/apache_beam/metrics/execution.py
----------------------------------------------------------------------
diff --git a/sdks/python/apache_beam/metrics/execution.py b/sdks/python/apache_beam/metrics/execution.py
index 3ba1735..f6c8990 100644
--- a/sdks/python/apache_beam/metrics/execution.py
+++ b/sdks/python/apache_beam/metrics/execution.py
@@ -21,6 +21,7 @@ Internal classes for Metrics API.
 The classes in this file keep shared state, and organize metrics information.
 
 Available classes:
+
 - MetricKey - Internal key for a metric.
 - MetricResult - Current status of a metric's updates/commits.
 - MetricsEnvironment - Keeps track of MetricsContainer and other metrics


http://git-wip-us.apache.org/repos/asf/beam/blob/228b18e8/sdks/python/apache_beam/metrics/metricbase.py
----------------------------------------------------------------------
diff --git a/sdks/python/apache_beam/metrics/metricbase.py b/sdks/python/apache_beam/metrics/metricbase.py
index 1ad6962..fa0ca75 100644
--- a/sdks/python/apache_beam/metrics/metricbase.py
+++ b/sdks/python/apache_beam/metrics/metricbase.py
@@ -18,9 +18,10 @@
 """
 The classes in this file are interfaces for metrics. They are not intended
 to be subclassed or created directly by users. To work with and access metrics,
-  users should use the classes and methods exposed in metric.py.
+users should use the classes and methods exposed in metric.py.
 
 Available classes:
+
 - Metric - Base interface of a metrics object.
 - Counter - Counter metric interface. Allows a count to be incremented or
   decremented during pipeline execution.


http://git-wip-us.apache.org/repos/asf/beam/blob/228b18e8/sdks/python/apache_beam/runners/direct/bundle_factory.py
----------------------------------------------------------------------
diff --git a/sdks/python/apache_beam/runners/direct/bundle_factory.py b/sdks/python/apache_beam/runners/direct/bundle_factory.py
index 38a0484..63319af 100644
--- a/sdks/python/apache_beam/runners/direct/bundle_factory.py
+++ b/sdks/python/apache_beam/runners/direct/bundle_factory.py
@@ -62,7 +62,8 @@ class Bundle(object):
     to restore WindowedValues upon get_elements() call.
 
     When this optimization is not desired, it can be avoided by an option when
-    creating bundles, like:
+    creating bundles, like::
+
       b = Bundle(stacked=False)
   """
 
@@ -71,7 +72,8 @@ class Bundle(object):
 
     It must be initialized from a single WindowedValue.
 
-    Example:
+    Example::
+
       s = StackedWindowedValues(windowed_value)
       if (another_windowed_value.timestamp == s.timestamp and
           another_windowed_value.windows == s.windows):


http://git-wip-us.apache.org/repos/asf/beam/blob/228b18e8/sdks/python/apache_beam/transforms/core.py
----------------------------------------------------------------------
diff --git a/sdks/python/apache_beam/transforms/core.py b/sdks/python/apache_beam/transforms/core.py
index 2efe38d..7abd784 100644
--- a/sdks/python/apache_beam/transforms/core.py
+++ b/sdks/python/apache_beam/transforms/core.py
@@ -142,8 +142,6 @@ class DoFn(WithTypeHints, HasDisplayData):
 
     Args:
       element: The element to be processed
-      context: a DoFnProcessContext object containing. See the
-        DoFnProcessContext documentation for details.
      *args: side inputs
      **kwargs: keyword side inputs
     """
@@ -915,7 +913,7 @@ class CombinePerKey(PTransformWithSideInputs):
   Args:
     pcoll: input pcollection.
     fn: instance of CombineFn to apply to all values under the same key in
-      pcoll, or a callable whose signature is f(iterable, *args, **kwargs)
+      pcoll, or a callable whose signature is ``f(iterable, *args, **kwargs)``
       (e.g., sum, max).
     *args: arguments and side inputs, passed directly to the CombineFn.
     **kwargs: arguments and side inputs, passed directly to the CombineFn.
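Aside: as the ``CombinePerKey`` docstring above notes, ``fn`` may be a plain
callable such as ``sum``; for example (the sample data is illustrative):

    import apache_beam as beam

    with beam.Pipeline() as p:
      totals = (
          p
          | beam.Create([('cat', 1), ('dog', 5), ('cat', 2)])
          | beam.CombinePerKey(sum))  # yields ('cat', 3) and ('dog', 5)
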
http://git-wip-us.apache.org/repos/asf/beam/blob/228b18e8/sdks/python/apache_beam/transforms/display.py
----------------------------------------------------------------------
diff --git a/sdks/python/apache_beam/transforms/display.py b/sdks/python/apache_beam/transforms/display.py
index 6e74512..5e25060 100644
--- a/sdks/python/apache_beam/transforms/display.py
+++ b/sdks/python/apache_beam/transforms/display.py
@@ -25,6 +25,7 @@ add static display data to a component, you can override the display_data
 method of the HasDisplayData class.
 
 Available classes:
+
 - HasDisplayData - Components that inherit from this class can have static
   display data shown in the UI.
 - DisplayDataItem - This class represents static display data elements.


http://git-wip-us.apache.org/repos/asf/beam/blob/228b18e8/sdks/python/apache_beam/utils/annotations.py
----------------------------------------------------------------------
diff --git a/sdks/python/apache_beam/utils/annotations.py b/sdks/python/apache_beam/utils/annotations.py
index aa53554..1ec0848 100644
--- a/sdks/python/apache_beam/utils/annotations.py
+++ b/sdks/python/apache_beam/utils/annotations.py
@@ -25,33 +25,37 @@ Both 'deprecated' and 'experimental' annotations can specify the current
 recommended version to use by means of a 'current' parameter.
 
 The following example illustrates how to annotate coexisting versions of the
-same function 'multiply'.
-def multiply(arg1, arg2):
-  print arg1, '*', arg2, '=',
-  return arg1*arg2
+same function 'multiply'.::
+
+  def multiply(arg1, arg2):
+    print arg1, '*', arg2, '=',
+    return arg1*arg2
 
 # This annotation marks 'old_multiply' as deprecated since 'v.1' and suggests
-# using 'multiply' instead.
-@deprecated(since='v.1', current='multiply')
-def old_multiply(arg1, arg2):
-  result = 0
-  for i in xrange(arg1):
-    result += arg2
-  print arg1, '*', arg2, '(the old way)=',
-  return result
+# using 'multiply' instead.::
+
+  @deprecated(since='v.1', current='multiply')
+  def old_multiply(arg1, arg2):
+    result = 0
+    for i in xrange(arg1):
+      result += arg2
+    print arg1, '*', arg2, '(the old way)=',
+    return result
 
 # This annotation marks 'exp_multiply' as experimental and suggests
-# using 'multiply' instead.
-@experimental(since='v.1', current='multiply')
-def exp_multiply(arg1, arg2):
-  print arg1, '*', arg2, '(the experimental way)=',
-  return (arg1*arg2)*(arg1/arg2)*(arg2/arg1)
-
-# Set a warning filter to control how often warnings are produced
-warnings.simplefilter("always")
-print multiply(5, 6)
-print old_multiply(5,6)
-print exp_multiply(5,6)
+# using 'multiply' instead.::
+
+  @experimental(since='v.1', current='multiply')
+  def exp_multiply(arg1, arg2):
+    print arg1, '*', arg2, '(the experimental way)=',
+    return (arg1*arg2)*(arg1/arg2)*(arg2/arg1)
+
+# Set a warning filter to control how often warnings are produced.::
+
+  warnings.simplefilter("always")
+  print multiply(5, 6)
+  print old_multiply(5,6)
+  print exp_multiply(5,6)
 """
 
 import warnings
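Aside: to see what these annotations do at call time, a hypothetical minimal
implementation of ``deprecated`` (a sketch only, not the actual code in
annotations.py) could look like:

    import functools
    import warnings

    def deprecated(since, current=None):
      def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
          message = '%s is deprecated since %s.' % (fn.__name__, since)
          if current:
            message += ' Use %s instead.' % current
          # Warn on every call; callers control frequency via warning filters.
          warnings.warn(message, DeprecationWarning, stacklevel=2)
          return fn(*args, **kwargs)
        return wrapper
      return decorator
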
- Usage: + Usage::: + mr = MemoryReporter(interval_second=30.0) mr.start() while ... @@ -88,7 +89,8 @@ class MemoryReporter(object): while ... <do some thing> - Also it could report on demand without continuous reporting. + Also it could report on demand without continuous reporting.:: + mr = MemoryReporter() # default interval 60s but not started. <do something> mr.report_once() http://git-wip-us.apache.org/repos/asf/beam/blob/228b18e8/sdks/python/generate_pydoc.sh ---------------------------------------------------------------------- diff --git a/sdks/python/generate_pydoc.sh b/sdks/python/generate_pydoc.sh index f96a649..d680f30 100755 --- a/sdks/python/generate_pydoc.sh +++ b/sdks/python/generate_pydoc.sh @@ -29,7 +29,10 @@ set -e mkdir -p target/docs/source # Exclude autogenerated API message definition files that aren't part of SDK. -excluded_internal_clients=(apache_beam/internal/clients/) +excluded_internal_clients=( + apache_beam/internal/clients/ + apache_beam/io/google_cloud_platform/internal/clients/ + apache_beam/runners/google_cloud_dataflow/internal/clients/) python $(type -p sphinx-apidoc) -f -o target/docs/source apache_beam \ "${excluded_internal_clients[@]}" @@ -59,8 +62,11 @@ EOF # Build the documentation using sphinx python $(type -p sphinx-build) -q target/docs/source target/docs/_build -c target/docs/source \ - -w "/tmp/sphinx-build.warnings.log" + -w "target/docs/sphinx-build.warnings.log" # Message is useful only when this script is run locally. In a remote # test environment, this path will be removed when the test completes. echo "Browse to file://$PWD/target/docs/_build/index.html" + +# Fail if there are errors in docs +! grep -q "ERROR:" target/docs/sphinx-build.warnings.log
