This is an automated email from the ASF dual-hosted git repository.
altay pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git
The following commit(s) were added to refs/heads/master by this push:
new 41dd4bf [BEAM-7389] Update to use util.Regex transform
new da6c1a8 Merge pull request #9435 from davidcavazos/regex-sample
41dd4bf is described below
commit 41dd4bf8b106dcbc48146e174ca468ff90d3cdfc
Author: David Cavazos
AuthorDate: Mon Aug 26 16:38:42 2019 -0700
[BEAM-7389] Update to use util.Regex transform
---
.../snippets/transforms/element_wise/regex.py | 235 ++---
.../snippets/transforms/element_wise/regex_test.py | 210 +++---
2 files changed, 289 insertions(+), 156 deletions(-)
diff --git
a/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py
b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py
index 44aa9629..975d5d3 100644
--- a/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py
+++ b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py
@@ -20,142 +20,217 @@ from __future__ import absolute_import
from __future__ import print_function
-def regex_match(test=None):
- # [START regex_match]
+def regex_matches(test=None):
+ # [START regex_matches]
import apache_beam as beam
- import re
-
- def parse_plant(text):
-m = re.match(r'^([^\s-]+)\s*-\s*(\w+)\s*-\s*(?P<duration>\w+)$', text)
-if m:
- yield {
- 'match': m.group(0), # contains the entire matched text
- 'icon': m.group(1), # ([^\s-]+) - group
- 'name': m.group(2), # (\w+) - group
- 'duration': m.group('duration'), # (?P<duration>\w+) - named group
- }
+ # Matches a named group 'icon', and then two comma-separated groups.
+ regex = r'(?P<icon>[^ ,]+), *(\w+), *(\w+)'
with beam.Pipeline() as pipeline:
-plant_matches = (
+plants_matches = (
pipeline
| 'Garden plants' >> beam.Create([
-' - Strawberry - perennial',
-'🥕 - Carrot - biennial',
-'# - invalid - format',
-'\t-\tEggplant\t-\tperennial',
-' - Tomato - annual',
-' - invalid - format with trailing words',
-'🥔-Potato-perennial',
+', Strawberry, perennial',
+'🥕, Carrot, biennial ignoring trailing words',
+', Eggplant, perennial',
+', Tomato, annual',
+'🥔,Potato,perennial',
+'# , invalid, format',
+'invalid, , format',
])
-| 'Parse plants' >> beam.FlatMap(parse_plant)
+| 'Parse plants' >> beam.Regex.matches(regex)
| beam.Map(print)
)
-# [END regex_match]
+# [END regex_matches]
if test:
- test(plant_matches)
+ test(plants_matches)
-def regex_search(test=None):
- # [START regex_search]
+def regex_all_matches(test=None):
+ # [START regex_all_matches]
import apache_beam as beam
- import re
-
- def parse_plant_duration(text):
-m = re.search(r'([^\s-]+)\s*-\s*(\w*)\s*-\s*(?P<duration>\w+)', text)
-if m:
- yield {
- 'match': m.group(0), # contains the entire matched text
- 'icon': m.group(1), # ([^\s-]+) - group
- 'name': m.group(2), # (\w+) - group
- 'duration': m.group('duration'), # (?P<duration>\w+) - named group
- }
+ # Matches a named group 'icon', and then two comma-separated groups.
+ regex = r'(?P<icon>[^ ,]+), *(\w+), *(\w+)'
with beam.Pipeline() as pipeline:
-plant_matches = (
+plants_all_matches = (
pipeline
| 'Garden plants' >> beam.Create([
-'# - Strawberry - perennial',
-'# 🥕 - Carrot - biennial',
-'# \t-\tEggplant\t-\tperennial',
-'# - Tomato - annual',
-'# 🥔-Potato-perennial',
+', Strawberry, perennial',
+'🥕, Carrot, biennial ignoring trailing words',
+', Eggplant, perennial',
+', Tomato, annual',
+'🥔,Potato,perennial',
+'# , invalid, format',
+'invalid, , format',
])
-| 'Parse plants' >> beam.FlatMap(parse_plant_duration)
+| 'Parse plants' >> beam.Regex.all_matches(regex)
| beam.Map(print)
)
-# [END regex_search]
+# [END regex_all_matches]
if test:
- test(plant_matches)
+ test(plants_all_matches)
+
+
+def regex_matches_kv(test=None):
+ # [START regex_matches_kv]
+ import apache_beam as beam
+
+ # Matches a named group 'icon', and then two comma-separated groups.
+ regex = r'(?P<icon>[^ ,]+), *(\w+), *(\w+)'
+ with beam.Pipeline() as pipeline:
+plants_matches_kv = (
+pipeline
+| 'Garden plants' >> beam.Create([
+', Strawberry, perennial',
+'凌, Carrot, biennial