[beam] branch master updated: Add Python snippet for Regex transform

altay Tue, 16 Jul 2019 17:49:21 -0700

This is an automated email from the ASF dual-hosted git repository.

altay pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git



The following commit(s) were added to refs/heads/master by this push:
     new f745702  Add Python snippet for Regex transform
     new db59a3d  Merge pull request #8905 from davidcavazos/element-wise-regex
f745702 is described below

commit f7457028dae67cd94003ef4405dae85c3c21b4c0
Author: David Cavazos <[email protected]>
AuthorDate: Mon Jun 10 16:25:33 2019 -0700

    Add Python snippet for Regex transform
---
 .../snippets/transforms/element_wise/regex.py      | 161 +++++++++++++++++++++
 .../snippets/transforms/element_wise/regex_test.py | 115 +++++++++++++++
 2 files changed, 276 insertions(+)

diff --git a/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py
new file mode 100644
index 0000000..44aa9629
--- /dev/null
+++ b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex.py
@@ -0,0 +1,161 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import absolute_import
+from __future__ import print_function
+
+
+def regex_match(test=None):
+  # [START regex_match]
+  import apache_beam as beam
+  import re
+
+  def parse_plant(text):
+    m = re.match(r'^([^\s-]+)\s*-\s*(\w+)\s*-\s*(?P<duration>\w+)$', text)
+    if m:
+      yield {
+          'match': m.group(0),              # contains the entire matched text
+          'icon': m.group(1),               # ([^\s-]+) - group
+          'name': m.group(2),               # (\w+) - group
+          'duration': m.group('duration'),  # (?P<duration>\w+) - named group
+      }
+
+  with beam.Pipeline() as pipeline:
+    plant_matches = (
+        pipeline
+        | 'Garden plants' >> beam.Create([
+            '🍓   -   Strawberry   -   perennial',
+            '🥕 - Carrot - biennial',
+            '# 🍌 - invalid - format',
+            '🍆\t-\tEggplant\t-\tperennial',
+            '🍅 - Tomato - annual',
+            '🍉 - invalid - format with trailing words',
+            '🥔-Potato-perennial',
+        ])
+        | 'Parse plants' >> beam.FlatMap(parse_plant)
+        | beam.Map(print)
+    )
+    # [END regex_match]
+    if test:
+      test(plant_matches)
+
+
+def regex_search(test=None):
+  # [START regex_search]
+  import apache_beam as beam
+  import re
+
+  def parse_plant_duration(text):
+    m = re.search(r'([^\s-]+)\s*-\s*(\w*)\s*-\s*(?P<duration>\w+)', text)
+    if m:
+      yield {
+          'match': m.group(0),              # contains the entire matched text
+          'icon': m.group(1),               # ([^\s-]+) - group
+          'name': m.group(2),               # (\w*) - group
+          'duration': m.group('duration'),  # (?P<duration>\w+) - named group
+      }
+
+  with beam.Pipeline() as pipeline:
+    plant_matches = (
+        pipeline
+        | 'Garden plants' >> beam.Create([
+            '# 🍓   -   Strawberry   -   perennial',
+            '# 🥕 - Carrot - biennial',
+            '# 🍆\t-\tEggplant\t-\tperennial',
+            '# 🍅 - Tomato - annual',
+            '# 🥔-Potato-perennial',
+        ])
+        | 'Parse plants' >> beam.FlatMap(parse_plant_duration)
+        | beam.Map(print)
+    )
+    # [END regex_search]
+    if test:
+      test(plant_matches)
+
+
+def regex_find_all(test=None):
+  # [START regex_find_all]
+  import apache_beam as beam
+  import re
+
+  def parse_words(text):
+    for m in re.finditer(r'[^\s-]+', text):
+      yield m.group()
+
+  with beam.Pipeline() as pipeline:
+    words = (
+        pipeline
+        | 'Garden plants' >> beam.Create([
+            '🍓   -   Strawberry   -   perennial',
+            '🥕 - Carrot - biennial',
+            '🍆\t-\tEggplant\t-\tperennial',
+            '🍅 - Tomato - annual',
+            '🥔-Potato-perennial',
+        ])
+        | 'Parse words' >> beam.FlatMap(parse_words)
+        | beam.Map(print)
+    )
+    # [END regex_find_all]
+    if test:
+      test(words)
+
+
+def regex_replace(test=None):
+  # [START regex_replace]
+  import apache_beam as beam
+  import re
+
+  with beam.Pipeline() as pipeline:
+    plants_csv = (
+        pipeline
+        | 'Garden plants' >> beam.Create([
+            '🍓   -   Strawberry   -   perennial',
+            '🥕 - Carrot - biennial',
+            '🍆\t-\tEggplant\t-\tperennial',
+            '🍅 - Tomato - annual',
+            '🥔-Potato-perennial',
+        ])
+        | 'To CSV' >> beam.Map(lambda text: re.sub(r'\s*-\s*', ',', text))
+        | beam.Map(print)
+    )
+    # [END regex_replace]
+    if test:
+      test(plants_csv)
+
+
+def regex_split(test=None):
+  # [START regex_split]
+  import apache_beam as beam
+  import re
+
+  with beam.Pipeline() as pipeline:
+    plants_columns = (
+        pipeline
+        | 'Garden plants' >> beam.Create([
+            '🍓   -   Strawberry   -   perennial',
+            '🥕 - Carrot - biennial',
+            '🍆\t-\tEggplant\t-\tperennial',
+            '🍅 - Tomato - annual',
+            '🥔-Potato-perennial',
+        ])
+        | 'Split' >> beam.Map(lambda text: re.split(r'\s*-\s*', text))
+        | beam.Map(print)
+    )
+    # [END regex_split]
+    if test:
+      test(plants_columns)
diff --git a/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex_test.py b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex_test.py
new file mode 100644
index 0000000..27c9524
--- /dev/null
+++ b/sdks/python/apache_beam/examples/snippets/transforms/element_wise/regex_test.py
@@ -0,0 +1,115 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import absolute_import
+from __future__ import print_function
+
+import unittest
+
+import mock
+
+from apache_beam.examples.snippets.transforms.element_wise.regex import *
+from apache_beam.testing.test_pipeline import TestPipeline
+from apache_beam.testing.util import assert_that
+from apache_beam.testing.util import equal_to
+
+
+@mock.patch('apache_beam.Pipeline', TestPipeline)
+# pylint: disable=line-too-long
+@mock.patch('apache_beam.examples.snippets.transforms.element_wise.regex.print', lambda elem: elem)
+# pylint: enable=line-too-long
+class RegexTest(unittest.TestCase):
+  def __init__(self, methodName):
+    super(RegexTest, self).__init__(methodName)
+    # pylint: disable=line-too-long
+    # [START plant_matches]
+    plant_matches = [
+        {'match': '🍓   -   Strawberry   -   perennial', 'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'},
+        {'match': '🥕 - Carrot - biennial', 'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial'},
+        {'match': '🍆\t-\tEggplant\t-\tperennial', 'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'},
+        {'match': '🍅 - Tomato - annual', 'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'},
+        {'match': '🥔-Potato-perennial', 'icon': '🥔', 'name': 'Potato', 'duration': 'perennial'},
+    ]
+    # [END plant_matches]
+    # pylint: enable=line-too-long
+    self.plant_matches_test = lambda actual: \
+        assert_that(actual, equal_to(plant_matches))
+
+    # [START words]
+    words = [
+        '🍓',
+        'Strawberry',
+        'perennial',
+        '🥕',
+        'Carrot',
+        'biennial',
+        '🍆',
+        'Eggplant',
+        'perennial',
+        '🍅',
+        'Tomato',
+        'annual',
+        '🥔',
+        'Potato',
+        'perennial',
+    ]
+    # [END words]
+    self.words_test = lambda actual: assert_that(actual, equal_to(words))
+
+    # [START plants_csv]
+    plants_csv = [
+        '🍓,Strawberry,perennial',
+        '🥕,Carrot,biennial',
+        '🍆,Eggplant,perennial',
+        '🍅,Tomato,annual',
+        '🥔,Potato,perennial',
+    ]
+    # [END plants_csv]
+    self.plants_csv_test = lambda actual: \
+        assert_that(actual, equal_to(plants_csv))
+
+    # [START plants_columns]
+    plants_columns = [
+        ['🍓', 'Strawberry', 'perennial'],
+        ['🥕', 'Carrot', 'biennial'],
+        ['🍆', 'Eggplant', 'perennial'],
+        ['🍅', 'Tomato', 'annual'],
+        ['🥔', 'Potato', 'perennial'],
+    ]
+    # [END plants_columns]
+    self.plants_columns_test = lambda actual: \
+        assert_that(actual, equal_to(plants_columns))
+
+  def test_regex_match(self):
+    regex_match(self.plant_matches_test)
+
+  def test_regex_search(self):
+    regex_search(self.plant_matches_test)
+
+  def test_regex_find_all(self):
+    regex_find_all(self.words_test)
+
+  def test_regex_replace(self):
+    regex_replace(self.plants_csv_test)
+
+  def test_regex_split(self):
+    regex_split(self.plants_columns_test)
+
+
+if __name__ == '__main__':
+  unittest.main()

[beam] branch master updated: Add Python snippet for Regex transform

Reply via email to