[beam] branch master updated: [BEAM-7389] Update to use util.Regex transform

altay Mon, 26 Aug 2019 19:07:01 -0700

This is an automated email from the ASF dual-hosted git repository.

altay pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git



The following commit(s) were added to refs/heads/master by this push:
     new 41dd4bf  [BEAM-7389] Update to use util.Regex transform
     new da6c1a8  Merge pull request #9435 from davidcavazos/regex-sample
41dd4bf is described below

commit 41dd4bf8b106dcbc48146e174ca468ff90d3cdfc
Author: David Cavazos <[email protected]>
AuthorDate: Mon Aug 26 16:38:42 2019 -0700

    [BEAM-7389] Update to use util.Regex transform
---
 .../snippets/transforms/element_wise/regex.py      | 235 ++++++++++++++-------
 .../snippets/transforms/element_wise/regex_test.py | 210 +++++++++++-------
 2 files changed, 289 insertions(+), 156 deletions(-)

diff --git a/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py
index 44aa9629..975d5d3 100644
--- a/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py
+++ b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py
@@ -20,142 +20,217 @@ from __future__ import absolute_import
 from __future__ import print_function
 
 
-def regex_match(test=None):
-  # [START regex_match]
+def regex_matches(test=None):
+  # [START regex_matches]
   import apache_beam as beam
-  import re
-
-  def parse_plant(text):
-    m = re.match(r'^([^\s-]+)\s*-\s*(\w+)\s*-\s*(?P<duration>\w+)$', text)
-    if m:
-      yield {
-          'match': m.group(0),              # contains the entire matched text
-          'icon': m.group(1),               # ([^\s-]+) - group
-          'name': m.group(2),               # (\w+) - group
-          'duration': m.group('duration'),  # (?P<duration>\w+) - named group
-      }
 
+  # Matches a named group 'icon', and then two comma-separated groups.
+  regex = r'(?P<icon>[^ ,]+), *(\w+), *(\w+)'
   with beam.Pipeline() as pipeline:
-    plant_matches = (
+    plants_matches = (
         pipeline
         | 'Garden plants' >> beam.Create([
-            '🍓   -   Strawberry   -   perennial',
-            '🥕 - Carrot - biennial',
-            '# 🍌 - invalid - format',
-            '🍆\t-\tEggplant\t-\tperennial',
-            '🍅 - Tomato - annual',
-            '🍉 - invalid - format with trailing words',
-            '🥔-Potato-perennial',
+            '🍓,   Strawberry,   perennial',
+            '🥕, Carrot, biennial ignoring trailing words',
+            '🍆, Eggplant, perennial',
+            '🍅, Tomato, annual',
+            '🥔,Potato,perennial',
+            '# 🍌, invalid, format',
+            'invalid, 🍉, format',
         ])
-        | 'Parse plants' >> beam.FlatMap(parse_plant)
+        | 'Parse plants' >> beam.Regex.matches(regex)
         | beam.Map(print)
     )
-    # [END regex_match]
+    # [END regex_matches]
     if test:
-      test(plant_matches)
+      test(plants_matches)
 
 
-def regex_search(test=None):
-  # [START regex_search]
+def regex_all_matches(test=None):
+  # [START regex_all_matches]
   import apache_beam as beam
-  import re
-
-  def parse_plant_duration(text):
-    m = re.search(r'([^\s-]+)\s*-\s*(\w*)\s*-\s*(?P<duration>\w+)', text)
-    if m:
-      yield {
-          'match': m.group(0),              # contains the entire matched text
-          'icon': m.group(1),               # ([^\s-]+) - group
-          'name': m.group(2),               # (\w+) - group
-          'duration': m.group('duration'),  # (?P<duration>\w+) - named group
-      }
 
+  # Matches a named group 'icon', and then two comma-separated groups.
+  regex = r'(?P<icon>[^ ,]+), *(\w+), *(\w+)'
   with beam.Pipeline() as pipeline:
-    plant_matches = (
+    plants_all_matches = (
         pipeline
         | 'Garden plants' >> beam.Create([
-            '# 🍓   -   Strawberry   -   perennial',
-            '# 🥕 - Carrot - biennial',
-            '# 🍆\t-\tEggplant\t-\tperennial',
-            '# 🍅 - Tomato - annual',
-            '# 🥔-Potato-perennial',
+            '🍓,   Strawberry,   perennial',
+            '🥕, Carrot, biennial ignoring trailing words',
+            '🍆, Eggplant, perennial',
+            '🍅, Tomato, annual',
+            '🥔,Potato,perennial',
+            '# 🍌, invalid, format',
+            'invalid, 🍉, format',
         ])
-        | 'Parse plants' >> beam.FlatMap(parse_plant_duration)
+        | 'Parse plants' >> beam.Regex.all_matches(regex)
         | beam.Map(print)
     )
-    # [END regex_search]
+    # [END regex_all_matches]
     if test:
-      test(plant_matches)
+      test(plants_all_matches)
+
+
+def regex_matches_kv(test=None):
+  # [START regex_matches_kv]
+  import apache_beam as beam
+
+  # Matches a named group 'icon', and then two comma-separated groups.
+  regex = r'(?P<icon>[^ ,]+), *(\w+), *(\w+)'
+  with beam.Pipeline() as pipeline:
+    plants_matches_kv = (
+        pipeline
+        | 'Garden plants' >> beam.Create([
+            '🍓,   Strawberry,   perennial',
+            '🥕, Carrot, biennial ignoring trailing words',
+            '🍆, Eggplant, perennial',
+            '🍅, Tomato, annual',
+            '🥔,Potato,perennial',
+            '# 🍌, invalid, format',
+            'invalid, 🍉, format',
+        ])
+        | 'Parse plants' >> beam.Regex.matches_kv(regex, keyGroup='icon')
+        | beam.Map(print)
+    )
+    # [END regex_matches_kv]
+    if test:
+      test(plants_matches_kv)
+
+
+def regex_find(test=None):
+  # [START regex_find]
+  import apache_beam as beam
+
+  # Matches a named group 'icon', and then two comma-separated groups.
+  regex = r'(?P<icon>[^ ,]+), *(\w+), *(\w+)'
+  with beam.Pipeline() as pipeline:
+    plants_matches = (
+        pipeline
+        | 'Garden plants' >> beam.Create([
+            '# 🍓,   Strawberry,   perennial',
+            '# 🥕, Carrot, biennial ignoring trailing words',
+            '# 🍆, Eggplant, perennial - 🍌, Banana, perennial',
+            '# 🍅, Tomato, annual - 🍉, Watermelon, annual',
+            '# 🥔,Potato,perennial',
+        ])
+        | 'Parse plants' >> beam.Regex.find(regex)
+        | beam.Map(print)
+    )
+    # [END regex_find]
+    if test:
+      test(plants_matches)
 
 
 def regex_find_all(test=None):
   # [START regex_find_all]
   import apache_beam as beam
-  import re
-
-  def parse_words(text):
-    for m in re.finditer(r'[^\s-]+', text):
-      yield m.group()
 
+  # Matches a named group 'icon', and then two comma-separated groups.
+  regex = r'(?P<icon>[^ ,]+), *(\w+), *(\w+)'
   with beam.Pipeline() as pipeline:
-    words = (
+    plants_find_all = (
         pipeline
         | 'Garden plants' >> beam.Create([
-            '🍓   -   Strawberry   -   perennial',
-            '🥕 - Carrot - biennial',
-            '🍆\t-\tEggplant\t-\tperennial',
-            '🍅 - Tomato - annual',
-            '🥔-Potato-perennial',
+            '# 🍓,   Strawberry,   perennial',
+            '# 🥕, Carrot, biennial ignoring trailing words',
+            '# 🍆, Eggplant, perennial - 🍌, Banana, perennial',
+            '# 🍅, Tomato, annual - 🍉, Watermelon, annual',
+            '# 🥔,Potato,perennial',
         ])
-        | 'Parse words' >> beam.FlatMap(parse_words)
+        | 'Parse plants' >> beam.Regex.find_all(regex)
         | beam.Map(print)
     )
     # [END regex_find_all]
     if test:
-      test(words)
+      test(plants_find_all)
+
+
+def regex_find_kv(test=None):
+  # [START regex_find_kv]
+  import apache_beam as beam
+
+  # Matches a named group 'icon', and then two comma-separated groups.
+  regex = r'(?P<icon>[^ ,]+), *(\w+), *(\w+)'
+  with beam.Pipeline() as pipeline:
+    plants_matches_kv = (
+        pipeline
+        | 'Garden plants' >> beam.Create([
+            '# 🍓,   Strawberry,   perennial',
+            '# 🥕, Carrot, biennial ignoring trailing words',
+            '# 🍆, Eggplant, perennial - 🍌, Banana, perennial',
+            '# 🍅, Tomato, annual - 🍉, Watermelon, annual',
+            '# 🥔,Potato,perennial',
+        ])
+        | 'Parse plants' >> beam.Regex.find_kv(regex, keyGroup='icon')
+        | beam.Map(print)
+    )
+    # [END regex_find_kv]
+    if test:
+      test(plants_matches_kv)
+
+
+def regex_replace_all(test=None):
+  # [START regex_replace_all]
+  import apache_beam as beam
+
+  with beam.Pipeline() as pipeline:
+    plants_replace_all = (
+        pipeline
+        | 'Garden plants' >> beam.Create([
+            '🍓  :   Strawberry   :   perennial',
+            '🥕 : Carrot : biennial',
+            '🍆\t\t:\t\tEggplant\t\t:\t\tperennial',
+            '🍅 : Tomato : annual',
+            '🥔:Potato:perennial',
+        ])
+        | 'To CSV' >> beam.Regex.replace_all(r'\s*:\s*', ',')
+        | beam.Map(print)
+    )
+    # [END regex_replace_all]
+    if test:
+      test(plants_replace_all)
 
 
-def regex_replace(test=None):
-  # [START regex_replace]
+def regex_replace_first(test=None):
+  # [START regex_replace_first]
   import apache_beam as beam
-  import re
 
   with beam.Pipeline() as pipeline:
-    plants_csv = (
+    plants_replace_first = (
         pipeline
         | 'Garden plants' >> beam.Create([
-            '🍓   -   Strawberry   -   perennial',
-            '🥕 - Carrot - biennial',
-            '🍆\t-\tEggplant\t-\tperennial',
-            '🍅 - Tomato - annual',
-            '🥔-Potato-perennial',
+            '🍓   ,   Strawberry, perennial',
+            '🥕, Carrot, biennial',
+            '🍆\t\t,\t\tEggplant, perennial',
+            '🍅, Tomato, annual',
+            '🥔,Potato, perennial',
         ])
-        | 'To CSV' >> beam.Map(lambda text: re.sub(r'\s*-\s*', ',', text))
+        | 'As dictionary' >> beam.Regex.replace_first(r'\s*,\s*', ': ')
         | beam.Map(print)
     )
-    # [END regex_replace]
+    # [END regex_replace_first]
     if test:
-      test(plants_csv)
+      test(plants_replace_first)
 
 
 def regex_split(test=None):
   # [START regex_split]
   import apache_beam as beam
-  import re
 
   with beam.Pipeline() as pipeline:
-    plants_columns = (
+    plants_split = (
         pipeline
         | 'Garden plants' >> beam.Create([
-            '🍓   -   Strawberry   -   perennial',
-            '🥕 - Carrot - biennial',
-            '🍆\t-\tEggplant\t-\tperennial',
-            '🍅 - Tomato - annual',
-            '🥔-Potato-perennial',
+            '🍓  :   Strawberry   :   perennial',
+            '🥕 : Carrot : biennial',
+            '🍆\t\t:\t\tEggplant : perennial',
+            '🍅 : Tomato : annual',
+            '🥔:Potato:perennial',
         ])
-        | 'Split' >> beam.Map(lambda text: re.split(r'\s*-\s*', text))
+        | 'Parse plants' >> beam.Regex.split(r'\s*:\s*')
         | beam.Map(print)
     )
     # [END regex_split]
     if test:
-      test(plants_columns)
+      test(plants_split)
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex_test.py b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex_test.py
index 27c9524..df4fc39 100644
--- a/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex_test.py
+++ b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex_test.py
@@ -23,92 +23,150 @@ import unittest
 
 import mock
 
-from apache_beam.examples.snippets.transforms.element_wise.regex import *
 from apache_beam.testing.test_pipeline import TestPipeline
 from apache_beam.testing.util import assert_that
 from apache_beam.testing.util import equal_to
 
+from . import regex
+
+
+def check_matches(actual):
+  # [START plants_matches]
+  plants_matches = [
+      '🍓,   Strawberry,   perennial',
+      '🥕, Carrot, biennial',
+      '🍆, Eggplant, perennial',
+      '🍅, Tomato, annual',
+      '🥔,Potato,perennial',
+  ]
+  # [END plants_matches]
+  assert_that(actual, equal_to(plants_matches))
+
+
+def check_all_matches(actual):
+  # [START plants_all_matches]
+  plants_all_matches = [
+      ['🍓,   Strawberry,   perennial', '🍓', 'Strawberry', 'perennial'],
+      ['🥕, Carrot, biennial', '🥕', 'Carrot', 'biennial'],
+      ['🍆, Eggplant, perennial', '🍆', 'Eggplant', 'perennial'],
+      ['🍅, Tomato, annual', '🍅', 'Tomato', 'annual'],
+      ['🥔,Potato,perennial', '🥔', 'Potato', 'perennial'],
+  ]
+  # [END plants_all_matches]
+  assert_that(actual, equal_to(plants_all_matches))
+
+
+def check_matches_kv(actual):
+  # [START plants_matches_kv]
+  plants_matches_kv = [
+      ('🍓', '🍓,   Strawberry,   perennial'),
+      ('🥕', '🥕, Carrot, biennial'),
+      ('🍆', '🍆, Eggplant, perennial'),
+      ('🍅', '🍅, Tomato, annual'),
+      ('🥔', '🥔,Potato,perennial'),
+  ]
+  # [END plants_matches_kv]
+  assert_that(actual, equal_to(plants_matches_kv))
+
+
+def check_find_all(actual):
+  # [START plants_find_all]
+  plants_find_all = [
+      ['🍓,   Strawberry,   perennial'],
+      ['🥕, Carrot, biennial'],
+      ['🍆, Eggplant, perennial', '🍌, Banana, perennial'],
+      ['🍅, Tomato, annual', '🍉, Watermelon, annual'],
+      ['🥔,Potato,perennial'],
+  ]
+  # [END plants_find_all]
+  assert_that(actual, equal_to(plants_find_all))
+
+
+def check_find_kv(actual):
+  # [START plants_find_kv]
+  plants_find_all = [
+      ('🍓', '🍓,   Strawberry,   perennial'),
+      ('🥕', '🥕, Carrot, biennial'),
+      ('🍆', '🍆, Eggplant, perennial'),
+      ('🍌', '🍌, Banana, perennial'),
+      ('🍅', '🍅, Tomato, annual'),
+      ('🍉', '🍉, Watermelon, annual'),
+      ('🥔', '🥔,Potato,perennial'),
+  ]
+  # [END plants_find_kv]
+  assert_that(actual, equal_to(plants_find_all))
+
+
+def check_replace_all(actual):
+  # [START plants_replace_all]
+  plants_replace_all = [
+      '🍓,Strawberry,perennial',
+      '🥕,Carrot,biennial',
+      '🍆,Eggplant,perennial',
+      '🍅,Tomato,annual',
+      '🥔,Potato,perennial',
+  ]
+  # [END plants_replace_all]
+  assert_that(actual, equal_to(plants_replace_all))
+
+
+def check_replace_first(actual):
+  # [START plants_replace_first]
+  plants_replace_first = [
+      '🍓: Strawberry, perennial',
+      '🥕: Carrot, biennial',
+      '🍆: Eggplant, perennial',
+      '🍅: Tomato, annual',
+      '🥔: Potato, perennial',
+  ]
+  # [END plants_replace_first]
+  assert_that(actual, equal_to(plants_replace_first))
+
+
+def check_split(actual):
+  # [START plants_split]
+  plants_split = [
+      ['🍓', 'Strawberry', 'perennial'],
+      ['🥕', 'Carrot', 'biennial'],
+      ['🍆', 'Eggplant', 'perennial'],
+      ['🍅', 'Tomato', 'annual'],
+      ['🥔', 'Potato', 'perennial'],
+  ]
+  # [END plants_split]
+  assert_that(actual, equal_to(plants_split))
+
 
 @mock.patch('apache_beam.Pipeline', TestPipeline)
 # pylint: disable=line-too-long
 @mock.patch('apache_beam.examples.snippets.transforms.element_wise.regex.print', lambda elem: elem)
 # pylint: enable=line-too-long
 class RegexTest(unittest.TestCase):
-  def __init__(self, methodName):
-    super(RegexTest, self).__init__(methodName)
-    # pylint: disable=line-too-long
-    # [START plant_matches]
-    plant_matches = [
-        {'match': '🍓   -   Strawberry   -   perennial', 'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'},
-        {'match': '🥕 - Carrot - biennial', 'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial'},
-        {'match': '🍆\t-\tEggplant\t-\tperennial', 'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'},
-        {'match': '🍅 - Tomato - annual', 'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'},
-        {'match': '🥔-Potato-perennial', 'icon': '🥔', 'name': 'Potato', 'duration': 'perennial'},
-    ]
-    # [END plant_matches]
-    # pylint: enable=line-too-long
-    self.plant_matches_test = lambda actual: \
-        assert_that(actual, equal_to(plant_matches))
-
-    # [START words]
-    words = [
-        '🍓',
-        'Strawberry',
-        'perennial',
-        '🥕',
-        'Carrot',
-        'biennial',
-        '🍆',
-        'Eggplant',
-        'perennial',
-        '🍅',
-        'Tomato',
-        'annual',
-        '🥔',
-        'Potato',
-        'perennial',
-    ]
-    # [END words]
-    self.words_test = lambda actual: assert_that(actual, equal_to(words))
-
-    # [START plants_csv]
-    plants_csv = [
-        '🍓,Strawberry,perennial',
-        '🥕,Carrot,biennial',
-        '🍆,Eggplant,perennial',
-        '🍅,Tomato,annual',
-        '🥔,Potato,perennial',
-    ]
-    # [END plants_csv]
-    self.plants_csv_test = lambda actual: \
-        assert_that(actual, equal_to(plants_csv))
-
-    # [START plants_columns]
-    plants_columns = [
-        ['🍓', 'Strawberry', 'perennial'],
-        ['🥕', 'Carrot', 'biennial'],
-        ['🍆', 'Eggplant', 'perennial'],
-        ['🍅', 'Tomato', 'annual'],
-        ['🥔', 'Potato', 'perennial'],
-    ]
-    # [END plants_columns]
-    self.plants_columns_test = lambda actual: \
-        assert_that(actual, equal_to(plants_columns))
-
-  def test_regex_match(self):
-    regex_match(self.plant_matches_test)
-
-  def test_regex_search(self):
-    regex_search(self.plant_matches_test)
-
-  def test_regex_find_all(self):
-    regex_find_all(self.words_test)
-
-  def test_regex_replace(self):
-    regex_replace(self.plants_csv_test)
-
-  def test_regex_split(self):
-    regex_split(self.plants_columns_test)
+  def test_matches(self):
+    regex.regex_matches(check_matches)
+
+  def test_all_matches(self):
+    regex.regex_all_matches(check_all_matches)
+
+  def test_matches_kv(self):
+    regex.regex_matches_kv(check_matches_kv)
+
+  def test_find(self):
+    regex.regex_find(check_matches)
+
+  def test_find_all(self):
+    regex.regex_find_all(check_find_all)
+
+  def test_find_kv(self):
+    regex.regex_find_kv(check_find_kv)
+
+  def test_replace_all(self):
+    regex.regex_replace_all(check_replace_all)
+
+  def test_replace_first(self):
+    regex.regex_replace_first(check_replace_first)
+
+  def test_split(self):
+    regex.regex_split(check_split)
 
 
 if __name__ == '__main__':

[beam] branch master updated: [BEAM-7389] Update to use util.Regex transform

Reply via email to