This is an automated email from the ASF dual-hosted git repository.
altay pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git
The following commit(s) were added to refs/heads/master by this push:
new 41dd4bf [BEAM-7389] Update to use util.Regex transform
new da6c1a8 Merge pull request #9435 from davidcavazos/regex-sample
41dd4bf is described below
commit 41dd4bf8b106dcbc48146e174ca468ff90d3cdfc
Author: David Cavazos <[email protected]>
AuthorDate: Mon Aug 26 16:38:42 2019 -0700
[BEAM-7389] Update to use util.Regex transform
---
.../snippets/transforms/element_wise/regex.py | 235 ++++++++++++++-------
.../snippets/transforms/element_wise/regex_test.py | 210 +++++++++++-------
2 files changed, 289 insertions(+), 156 deletions(-)
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py
index 44aa9629..975d5d3 100644
--- a/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py
+++ b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py
@@ -20,142 +20,217 @@ from __future__ import absolute_import
from __future__ import print_function
-def regex_match(test=None):
- # [START regex_match]
+def regex_matches(test=None):
+ # [START regex_matches]
import apache_beam as beam
- import re
-
- def parse_plant(text):
- m = re.match(r'^([^\s-]+)\s*-\s*(\w+)\s*-\s*(?P<duration>\w+)$', text)
- if m:
- yield {
- 'match': m.group(0), # contains the entire matched text
- 'icon': m.group(1), # ([^\s-]+) - group
- 'name': m.group(2), # (\w+) - group
- 'duration': m.group('duration'), # (?P<duration>\w+) - named group
- }
+ # Matches a named group 'icon', and then two comma-separated groups.
+ regex = r'(?P<icon>[^ ,]+), *(\w+), *(\w+)'
with beam.Pipeline() as pipeline:
- plant_matches = (
+ plants_matches = (
pipeline
| 'Garden plants' >> beam.Create([
- '🍓 - Strawberry - perennial',
- '🥕 - Carrot - biennial',
- '# 🍌 - invalid - format',
- '🍆\t-\tEggplant\t-\tperennial',
- '🍅 - Tomato - annual',
- '🍉 - invalid - format with trailing words',
- '🥔-Potato-perennial',
+ '🍓, Strawberry, perennial',
+ '🥕, Carrot, biennial ignoring trailing words',
+ '🍆, Eggplant, perennial',
+ '🍅, Tomato, annual',
+ '🥔,Potato,perennial',
+ '# 🍌, invalid, format',
+ 'invalid, 🍉, format',
])
- | 'Parse plants' >> beam.FlatMap(parse_plant)
+ | 'Parse plants' >> beam.Regex.matches(regex)
| beam.Map(print)
)
- # [END regex_match]
+ # [END regex_matches]
if test:
- test(plant_matches)
+ test(plants_matches)
-def regex_search(test=None):
- # [START regex_search]
+def regex_all_matches(test=None):
+ # [START regex_all_matches]
import apache_beam as beam
- import re
-
- def parse_plant_duration(text):
- m = re.search(r'([^\s-]+)\s*-\s*(\w*)\s*-\s*(?P<duration>\w+)', text)
- if m:
- yield {
- 'match': m.group(0), # contains the entire matched text
- 'icon': m.group(1), # ([^\s-]+) - group
- 'name': m.group(2), # (\w+) - group
- 'duration': m.group('duration'), # (?P<duration>\w+) - named group
- }
+ # Matches a named group 'icon', and then two comma-separated groups.
+ regex = r'(?P<icon>[^ ,]+), *(\w+), *(\w+)'
with beam.Pipeline() as pipeline:
- plant_matches = (
+ plants_all_matches = (
pipeline
| 'Garden plants' >> beam.Create([
- '# 🍓 - Strawberry - perennial',
- '# 🥕 - Carrot - biennial',
- '# 🍆\t-\tEggplant\t-\tperennial',
- '# 🍅 - Tomato - annual',
- '# 🥔-Potato-perennial',
+ '🍓, Strawberry, perennial',
+ '🥕, Carrot, biennial ignoring trailing words',
+ '🍆, Eggplant, perennial',
+ '🍅, Tomato, annual',
+ '🥔,Potato,perennial',
+ '# 🍌, invalid, format',
+ 'invalid, 🍉, format',
])
- | 'Parse plants' >> beam.FlatMap(parse_plant_duration)
+ | 'Parse plants' >> beam.Regex.all_matches(regex)
| beam.Map(print)
)
- # [END regex_search]
+ # [END regex_all_matches]
if test:
- test(plant_matches)
+ test(plants_all_matches)
+
+
+def regex_matches_kv(test=None):
+ # [START regex_matches_kv]
+ import apache_beam as beam
+
+ # Matches a named group 'icon', and then two comma-separated groups.
+ regex = r'(?P<icon>[^ ,]+), *(\w+), *(\w+)'
+ with beam.Pipeline() as pipeline:
+ plants_matches_kv = (
+ pipeline
+ | 'Garden plants' >> beam.Create([
+ '🍓, Strawberry, perennial',
+ '🥕, Carrot, biennial ignoring trailing words',
+ '🍆, Eggplant, perennial',
+ '🍅, Tomato, annual',
+ '🥔,Potato,perennial',
+ '# 🍌, invalid, format',
+ 'invalid, 🍉, format',
+ ])
+ | 'Parse plants' >> beam.Regex.matches_kv(regex, keyGroup='icon')
+ | beam.Map(print)
+ )
+ # [END regex_matches_kv]
+ if test:
+ test(plants_matches_kv)
+
+
+def regex_find(test=None):
+ # [START regex_find]
+ import apache_beam as beam
+
+ # Matches a named group 'icon', and then two comma-separated groups.
+ regex = r'(?P<icon>[^ ,]+), *(\w+), *(\w+)'
+ with beam.Pipeline() as pipeline:
+ plants_matches = (
+ pipeline
+ | 'Garden plants' >> beam.Create([
+ '# 🍓, Strawberry, perennial',
+ '# 🥕, Carrot, biennial ignoring trailing words',
+ '# 🍆, Eggplant, perennial - 🍌, Banana, perennial',
+ '# 🍅, Tomato, annual - 🍉, Watermelon, annual',
+ '# 🥔,Potato,perennial',
+ ])
+ | 'Parse plants' >> beam.Regex.find(regex)
+ | beam.Map(print)
+ )
+ # [END regex_find]
+ if test:
+ test(plants_matches)
def regex_find_all(test=None):
# [START regex_find_all]
import apache_beam as beam
- import re
-
- def parse_words(text):
- for m in re.finditer(r'[^\s-]+', text):
- yield m.group()
+ # Matches a named group 'icon', and then two comma-separated groups.
+ regex = r'(?P<icon>[^ ,]+), *(\w+), *(\w+)'
with beam.Pipeline() as pipeline:
- words = (
+ plants_find_all = (
pipeline
| 'Garden plants' >> beam.Create([
- '🍓 - Strawberry - perennial',
- '🥕 - Carrot - biennial',
- '🍆\t-\tEggplant\t-\tperennial',
- '🍅 - Tomato - annual',
- '🥔-Potato-perennial',
+ '# 🍓, Strawberry, perennial',
+ '# 🥕, Carrot, biennial ignoring trailing words',
+ '# 🍆, Eggplant, perennial - 🍌, Banana, perennial',
+ '# 🍅, Tomato, annual - 🍉, Watermelon, annual',
+ '# 🥔,Potato,perennial',
])
- | 'Parse words' >> beam.FlatMap(parse_words)
+ | 'Parse plants' >> beam.Regex.find_all(regex)
| beam.Map(print)
)
# [END regex_find_all]
if test:
- test(words)
+ test(plants_find_all)
+
+
+def regex_find_kv(test=None):
+ # [START regex_find_kv]
+ import apache_beam as beam
+
+ # Matches a named group 'icon', and then two comma-separated groups.
+ regex = r'(?P<icon>[^ ,]+), *(\w+), *(\w+)'
+ with beam.Pipeline() as pipeline:
+ plants_matches_kv = (
+ pipeline
+ | 'Garden plants' >> beam.Create([
+ '# 🍓, Strawberry, perennial',
+ '# 🥕, Carrot, biennial ignoring trailing words',
+ '# 🍆, Eggplant, perennial - 🍌, Banana, perennial',
+ '# 🍅, Tomato, annual - 🍉, Watermelon, annual',
+ '# 🥔,Potato,perennial',
+ ])
+ | 'Parse plants' >> beam.Regex.find_kv(regex, keyGroup='icon')
+ | beam.Map(print)
+ )
+ # [END regex_find_kv]
+ if test:
+ test(plants_matches_kv)
+
+
+def regex_replace_all(test=None):
+ # [START regex_replace_all]
+ import apache_beam as beam
+
+ with beam.Pipeline() as pipeline:
+ plants_replace_all = (
+ pipeline
+ | 'Garden plants' >> beam.Create([
+ '🍓 : Strawberry : perennial',
+ '🥕 : Carrot : biennial',
+ '🍆\t\t:\t\tEggplant\t\t:\t\tperennial',
+ '🍅 : Tomato : annual',
+ '🥔:Potato:perennial',
+ ])
+ | 'To CSV' >> beam.Regex.replace_all(r'\s*:\s*', ',')
+ | beam.Map(print)
+ )
+ # [END regex_replace_all]
+ if test:
+ test(plants_replace_all)
-def regex_replace(test=None):
- # [START regex_replace]
+def regex_replace_first(test=None):
+ # [START regex_replace_first]
import apache_beam as beam
- import re
with beam.Pipeline() as pipeline:
- plants_csv = (
+ plants_replace_first = (
pipeline
| 'Garden plants' >> beam.Create([
- '🍓 - Strawberry - perennial',
- '🥕 - Carrot - biennial',
- '🍆\t-\tEggplant\t-\tperennial',
- '🍅 - Tomato - annual',
- '🥔-Potato-perennial',
+ '🍓 , Strawberry, perennial',
+ '🥕, Carrot, biennial',
+ '🍆\t\t,\t\tEggplant, perennial',
+ '🍅, Tomato, annual',
+ '🥔,Potato, perennial',
])
- | 'To CSV' >> beam.Map(lambda text: re.sub(r'\s*-\s*', ',', text))
+ | 'As dictionary' >> beam.Regex.replace_first(r'\s*,\s*', ': ')
| beam.Map(print)
)
- # [END regex_replace]
+ # [END regex_replace_first]
if test:
- test(plants_csv)
+ test(plants_replace_first)
def regex_split(test=None):
# [START regex_split]
import apache_beam as beam
- import re
with beam.Pipeline() as pipeline:
- plants_columns = (
+ plants_split = (
pipeline
| 'Garden plants' >> beam.Create([
- '🍓 - Strawberry - perennial',
- '🥕 - Carrot - biennial',
- '🍆\t-\tEggplant\t-\tperennial',
- '🍅 - Tomato - annual',
- '🥔-Potato-perennial',
+ '🍓 : Strawberry : perennial',
+ '🥕 : Carrot : biennial',
+ '🍆\t\t:\t\tEggplant : perennial',
+ '🍅 : Tomato : annual',
+ '🥔:Potato:perennial',
])
- | 'Split' >> beam.Map(lambda text: re.split(r'\s*-\s*', text))
+ | 'Parse plants' >> beam.Regex.split(r'\s*:\s*')
| beam.Map(print)
)
# [END regex_split]
if test:
- test(plants_columns)
+ test(plants_split)
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex_test.py b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex_test.py
index 27c9524..df4fc39 100644
--- a/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex_test.py
+++ b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex_test.py
@@ -23,92 +23,150 @@ import unittest
import mock
-from apache_beam.examples.snippets.transforms.element_wise.regex import *
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
+from . import regex
+
+
+def check_matches(actual):
+ # [START plants_matches]
+ plants_matches = [
+ '🍓, Strawberry, perennial',
+ '🥕, Carrot, biennial',
+ '🍆, Eggplant, perennial',
+ '🍅, Tomato, annual',
+ '🥔,Potato,perennial',
+ ]
+ # [END plants_matches]
+ assert_that(actual, equal_to(plants_matches))
+
+
+def check_all_matches(actual):
+ # [START plants_all_matches]
+ plants_all_matches = [
+ ['🍓, Strawberry, perennial', '🍓', 'Strawberry', 'perennial'],
+ ['🥕, Carrot, biennial', '🥕', 'Carrot', 'biennial'],
+ ['🍆, Eggplant, perennial', '🍆', 'Eggplant', 'perennial'],
+ ['🍅, Tomato, annual', '🍅', 'Tomato', 'annual'],
+ ['🥔,Potato,perennial', '🥔', 'Potato', 'perennial'],
+ ]
+ # [END plants_all_matches]
+ assert_that(actual, equal_to(plants_all_matches))
+
+
+def check_matches_kv(actual):
+ # [START plants_matches_kv]
+ plants_matches_kv = [
+ ('🍓', '🍓, Strawberry, perennial'),
+ ('🥕', '🥕, Carrot, biennial'),
+ ('🍆', '🍆, Eggplant, perennial'),
+ ('🍅', '🍅, Tomato, annual'),
+ ('🥔', '🥔,Potato,perennial'),
+ ]
+ # [END plants_matches_kv]
+ assert_that(actual, equal_to(plants_matches_kv))
+
+
+def check_find_all(actual):
+ # [START plants_find_all]
+ plants_find_all = [
+ ['🍓, Strawberry, perennial'],
+ ['🥕, Carrot, biennial'],
+ ['🍆, Eggplant, perennial', '🍌, Banana, perennial'],
+ ['🍅, Tomato, annual', '🍉, Watermelon, annual'],
+ ['🥔,Potato,perennial'],
+ ]
+ # [END plants_find_all]
+ assert_that(actual, equal_to(plants_find_all))
+
+
+def check_find_kv(actual):
+ # [START plants_find_kv]
+ plants_find_all = [
+ ('🍓', '🍓, Strawberry, perennial'),
+ ('🥕', '🥕, Carrot, biennial'),
+ ('🍆', '🍆, Eggplant, perennial'),
+ ('🍌', '🍌, Banana, perennial'),
+ ('🍅', '🍅, Tomato, annual'),
+ ('🍉', '🍉, Watermelon, annual'),
+ ('🥔', '🥔,Potato,perennial'),
+ ]
+ # [END plants_find_kv]
+ assert_that(actual, equal_to(plants_find_all))
+
+
+def check_replace_all(actual):
+ # [START plants_replace_all]
+ plants_replace_all = [
+ '🍓,Strawberry,perennial',
+ '🥕,Carrot,biennial',
+ '🍆,Eggplant,perennial',
+ '🍅,Tomato,annual',
+ '🥔,Potato,perennial',
+ ]
+ # [END plants_replace_all]
+ assert_that(actual, equal_to(plants_replace_all))
+
+
+def check_replace_first(actual):
+ # [START plants_replace_first]
+ plants_replace_first = [
+ '🍓: Strawberry, perennial',
+ '🥕: Carrot, biennial',
+ '🍆: Eggplant, perennial',
+ '🍅: Tomato, annual',
+ '🥔: Potato, perennial',
+ ]
+ # [END plants_replace_first]
+ assert_that(actual, equal_to(plants_replace_first))
+
+
+def check_split(actual):
+ # [START plants_split]
+ plants_split = [
+ ['🍓', 'Strawberry', 'perennial'],
+ ['🥕', 'Carrot', 'biennial'],
+ ['🍆', 'Eggplant', 'perennial'],
+ ['🍅', 'Tomato', 'annual'],
+ ['🥔', 'Potato', 'perennial'],
+ ]
+ # [END plants_split]
+ assert_that(actual, equal_to(plants_split))
+
@mock.patch('apache_beam.Pipeline', TestPipeline)
# pylint: disable=line-too-long
@mock.patch('apache_beam.examples.snippets.transforms.element_wise.regex.print',
lambda elem: elem)
# pylint: enable=line-too-long
class RegexTest(unittest.TestCase):
- def __init__(self, methodName):
- super(RegexTest, self).__init__(methodName)
- # pylint: disable=line-too-long
- # [START plant_matches]
- plant_matches = [
- {'match': '🍓 - Strawberry - perennial', 'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'},
- {'match': '🥕 - Carrot - biennial', 'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial'},
- {'match': '🍆\t-\tEggplant\t-\tperennial', 'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'},
- {'match': '🍅 - Tomato - annual', 'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'},
- {'match': '🥔-Potato-perennial', 'icon': '🥔', 'name': 'Potato', 'duration': 'perennial'},
- ]
- # [END plant_matches]
- # pylint: enable=line-too-long
- self.plant_matches_test = lambda actual: \
- assert_that(actual, equal_to(plant_matches))
-
- # [START words]
- words = [
- '🍓',
- 'Strawberry',
- 'perennial',
- '🥕',
- 'Carrot',
- 'biennial',
- '🍆',
- 'Eggplant',
- 'perennial',
- '🍅',
- 'Tomato',
- 'annual',
- '🥔',
- 'Potato',
- 'perennial',
- ]
- # [END words]
- self.words_test = lambda actual: assert_that(actual, equal_to(words))
-
- # [START plants_csv]
- plants_csv = [
- '🍓,Strawberry,perennial',
- '🥕,Carrot,biennial',
- '🍆,Eggplant,perennial',
- '🍅,Tomato,annual',
- '🥔,Potato,perennial',
- ]
- # [END plants_csv]
- self.plants_csv_test = lambda actual: \
- assert_that(actual, equal_to(plants_csv))
-
- # [START plants_columns]
- plants_columns = [
- ['🍓', 'Strawberry', 'perennial'],
- ['🥕', 'Carrot', 'biennial'],
- ['🍆', 'Eggplant', 'perennial'],
- ['🍅', 'Tomato', 'annual'],
- ['🥔', 'Potato', 'perennial'],
- ]
- # [END plants_columns]
- self.plants_columns_test = lambda actual: \
- assert_that(actual, equal_to(plants_columns))
-
- def test_regex_match(self):
- regex_match(self.plant_matches_test)
-
- def test_regex_search(self):
- regex_search(self.plant_matches_test)
-
- def test_regex_find_all(self):
- regex_find_all(self.words_test)
-
- def test_regex_replace(self):
- regex_replace(self.plants_csv_test)
-
- def test_regex_split(self):
- regex_split(self.plants_columns_test)
+ def test_matches(self):
+ regex.regex_matches(check_matches)
+
+ def test_all_matches(self):
+ regex.regex_all_matches(check_all_matches)
+
+ def test_matches_kv(self):
+ regex.regex_matches_kv(check_matches_kv)
+
+ def test_find(self):
+ regex.regex_find(check_matches)
+
+ def test_find_all(self):
+ regex.regex_find_all(check_find_all)
+
+ def test_find_kv(self):
+ regex.regex_find_kv(check_find_kv)
+
+ def test_replace_all(self):
+ regex.regex_replace_all(check_replace_all)
+
+ def test_replace_first(self):
+ regex.regex_replace_first(check_replace_first)
+
+ def test_split(self):
+ regex.regex_split(check_split)
if __name__ == '__main__':