[beam] branch master updated: Add hypothesis strategy to generate complex schemas, use it to test `schemas` module (#23361)

pabloem Thu, 15 Dec 2022 09:36:05 -0800

This is an automated email from the ASF dual-hosted git repository.

pabloem pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git



The following commit(s) were added to refs/heads/master by this push:
     new 958a9302ebb Add hypothesis strategy to generate complex schemas, use it to test `schemas` module (#23361)
958a9302ebb is described below

commit 958a9302ebb8231054a6d76da9438b8e33a3d474
Author: Brian Hulette <[email protected]>
AuthorDate: Thu Dec 15 09:35:45 2022 -0800

    Add hypothesis strategy to generate complex schemas, use it to test `schemas` module (#23361)
    
    * Add hypothesis
    
    * Test schemas module with hypothesis-generated complex schemas
    
    * Fix schema id handling for nested schemas
    
    * removing duplicated dependency
    
    Co-authored-by: Pablo <[email protected]>
---
 .gitignore                                         |   1 +
 sdks/python/apache_beam/typehints/row_type.py      |  17 +++-
 sdks/python/apache_beam/typehints/schemas.py       |   1 +
 sdks/python/apache_beam/typehints/schemas_test.py  |  17 ++++
 .../apache_beam/typehints/testing/__init__.py      |  18 ++++
 .../apache_beam/typehints/testing/strategies.py    | 102 +++++++++++++++++++++
 sdks/python/setup.py                               |   2 +-
 7 files changed, 156 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index f5fc63f9de8..73c9e05b4ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,6 +45,7 @@ sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/sampl
 **/env/**/*
 **/.mypy_cache
 **/.dmypy.json
+**/.hypothesis
 sdks/python/**/*.c
 sdks/python/**/*.so
 sdks/python/**/*.egg
diff --git a/sdks/python/apache_beam/typehints/row_type.py b/sdks/python/apache_beam/typehints/row_type.py
index b02877e4daa..fd7885ad59c 100644
--- a/sdks/python/apache_beam/typehints/row_type.py
+++ b/sdks/python/apache_beam/typehints/row_type.py
@@ -34,6 +34,15 @@ from apache_beam.typehints.schema_registry import SchemaTypeRegistry
 _BEAM_SCHEMA_ID = "_beam_schema_id"
 
 
+def _user_type_is_generated(user_type: type) -> bool:
+  if not hasattr(user_type, _BEAM_SCHEMA_ID):
+    return False
+
+  schema_id = getattr(user_type, _BEAM_SCHEMA_ID)
+  type_name = 'BeamSchema_{}'.format(schema_id.replace('-', '_'))
+  return user_type.__name__ == type_name
+
+
 class RowTypeConstraint(typehints.TypeConstraint):
   def __init__(
       self,
@@ -98,6 +107,13 @@ class RowTypeConstraint(typehints.TypeConstraint):
       fields = [(name, user_type.__annotations__[name])
                 for name in user_type._fields]
 
+      if _user_type_is_generated(user_type):
+        return RowTypeConstraint.from_fields(
+            fields,
+            schema_id=getattr(user_type, _BEAM_SCHEMA_ID),
+            schema_options=schema_options,
+            field_options=field_options)
+
       # TODO(https://github.com/apache/beam/issues/22125): Add user API for
       # specifying schema/field options
       return RowTypeConstraint(
@@ -203,7 +219,6 @@ class GeneratedClassRowTypeConstraint(RowTypeConstraint):
         field_options=field_options,
         **kwargs)
     user_type = named_tuple_from_schema(schema, **kwargs)
-    setattr(user_type, _BEAM_SCHEMA_ID, schema_id)
 
     super().__init__(
         fields,
diff --git a/sdks/python/apache_beam/typehints/schemas.py b/sdks/python/apache_beam/typehints/schemas.py
index 55335bbd6b6..dd917a205b4 100644
--- a/sdks/python/apache_beam/typehints/schemas.py
+++ b/sdks/python/apache_beam/typehints/schemas.py
@@ -522,6 +522,7 @@ class SchemaTranslation(object):
         user_type,
         '__reduce__',
         _named_tuple_reduce_method(schema.SerializeToString()))
+    setattr(user_type, row_type._BEAM_SCHEMA_ID, schema.id)
 
     self.schema_registry.add(user_type, schema)
     coders.registry.register_coder(user_type, coders.RowCoder)
diff --git a/sdks/python/apache_beam/typehints/schemas_test.py b/sdks/python/apache_beam/typehints/schemas_test.py
index de2ed829ab3..2495f5722ad 100644
--- a/sdks/python/apache_beam/typehints/schemas_test.py
+++ b/sdks/python/apache_beam/typehints/schemas_test.py
@@ -32,6 +32,8 @@ from typing import Sequence
 import cloudpickle
 import dill
 import numpy as np
+from hypothesis import given
+from hypothesis import settings
 from parameterized import parameterized
 from parameterized import parameterized_class
 
@@ -44,6 +46,7 @@ from apache_beam.typehints.schemas import named_tuple_from_schema
 from apache_beam.typehints.schemas import named_tuple_to_schema
 from apache_beam.typehints.schemas import typing_from_runner_api
 from apache_beam.typehints.schemas import typing_to_runner_api
+from apache_beam.typehints.testing.strategies import named_fields
 from apache_beam.utils.timestamp import Timestamp
 
 all_nonoptional_primitives = [
@@ -640,6 +643,20 @@ class SchemaTest(unittest.TestCase):
     self.assertEqual(instance, (np.int64(35), 'baz'))
 
 
+class HypothesisTest(unittest.TestCase):
+  # There is considerable variablility in runtime for this test, disable
+  # deadline.
+  @settings(deadline=None)
+  @given(named_fields())
+  def test_named_fields_roundtrip(self, named_fields):
+    typehint = row_type.RowTypeConstraint.from_fields(named_fields)
+    roundtripped = typing_from_runner_api(
+        typing_to_runner_api(typehint, schema_registry=SchemaTypeRegistry()),
+        schema_registry=SchemaTypeRegistry())
+
+    self.assertEqual(typehint, roundtripped)
+
+
 @parameterized_class([
     {
         'pickler': pickle,
diff --git a/sdks/python/apache_beam/typehints/testing/__init__.py b/sdks/python/apache_beam/typehints/testing/__init__.py
new file mode 100644
index 00000000000..20f2809b550
--- /dev/null
+++ b/sdks/python/apache_beam/typehints/testing/__init__.py
@@ -0,0 +1,18 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Testing utilities for apache_beam.typehints."""
diff --git a/sdks/python/apache_beam/typehints/testing/strategies.py b/sdks/python/apache_beam/typehints/testing/strategies.py
new file mode 100644
index 00000000000..a388e25eea2
--- /dev/null
+++ b/sdks/python/apache_beam/typehints/testing/strategies.py
@@ -0,0 +1,102 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""hypothesis strategies for generating schema types.
+
+Intended for internal use only, no backward-compatibility guarantees."""
+
+import keyword
+import unicodedata
+from typing import Mapping
+from typing import Optional
+from typing import Sequence
+
+from hypothesis import strategies as st
+
+from apache_beam.typehints import row_type
+from apache_beam.typehints.schemas import _PRIMITIVES
+
+PRIMITIVES = [p[0] for p in _PRIMITIVES]
+
+
+def field_names():
+  @st.composite
+  def field_name_candidates(draw):
+    """Strategy to produce valid field names for Beam schema types."""
+    # unicode categories that cannot be used in Python identifiers
+    identifer_denylist = (
+        'Lo', 'Lm', 'C', 'P', 'Sm', 'Sc', 'So', 'Sk', 'M', 'No', 'Z')
+
+    # First character can't be numeric (Nd).
+    # It also can't start with '_' in a NamedTuple.
+    field_first_character = draw(
+        st.text(
+            alphabet=st.characters(
+                blacklist_categories=('Nd', ) + identifer_denylist,
+                blacklist_characters=('_')),
+            min_size=1,
+            max_size=1))
+    field_remainder = draw(
+        st.text(
+            alphabet=st.characters(blacklist_categories=identifer_denylist)))
+
+    return field_first_character + field_remainder
+
+  return field_name_candidates().filter(
+      lambda s: s.isidentifier() and not keyword.iskeyword(s))
+
+
+def _named_fields_from_types(types):
+  return st.lists(
+      st.tuples(field_names(), types),
+      min_size=1,
+      # Python identifiers are normalized with form NFKC (see
+      # https://peps.python.org/pep-0672/#normalizing-identifiers). We use the
+      # same normalization here to avoid name collisions.
+      unique_by=lambda name_and_type: unicodedata.normalize(
+          'NFKC', name_and_type[0]),
+  )
+
+
+def types():
+  """Strategy to produce types that are convertible to Beam schema FieldType
+  instances."""
+  def _extend_types(types):
+    optionals = types.map(lambda typ: Optional[typ])
+    sequences = types.map(lambda typ: Sequence[typ])
+    mappings = st.tuples(types,
+                         types).map(lambda typs: Mapping[typs[0], typs[1]])
+    rows = _named_fields_from_types(types).map(
+        row_type.RowTypeConstraint.from_fields)
+
+    return st.one_of(optionals, sequences, mappings, rows)
+
+  # TODO: Currently this will only draw from the primitive types that can be
+  # roundtripped faithfully (e.g. np.int64, not int). We should add support for
+  # other types:
+  # - Logical Types (e.g. Timestamp)
+  # - Shunted primitive types (e.g. int)
+  # We'll need to provide support for limiting the types that are drawn. This
+  # could be similar to the allowlist[_categories]/denylist[_categories] pattern
+  # used in st.characters.
+  return st.recursive(st.sampled_from(PRIMITIVES), _extend_types)
+
+
+def named_fields():
+  """Strategy to produce a set of named fields (type ``List[Tuple[str,
+  type]]``)."""
+  return _named_fields_from_types(types())
diff --git a/sdks/python/setup.py b/sdks/python/setup.py
index 4e7c0483cc1..f55b66f4005 100644
--- a/sdks/python/setup.py
+++ b/sdks/python/setup.py
@@ -272,7 +272,6 @@ if __name__ == '__main__':
           ],
           'test': [
             'freezegun>=0.3.12',
-            'hypothesis<7',
             'joblib>=1.0.1',
             'mock>=1.0.1,<3.0.0',
             'pandas<2.0.0',
@@ -289,6 +288,7 @@ if __name__ == '__main__':
             'psycopg2-binary>=2.8.5,<3.0.0',
             'testcontainers[mysql]>=3.0.3,<4.0.0',
             'cryptography>=36.0.0',
+            'hypothesis>5.0.0,<=7.0.0',
           ],
           'gcp': [
             'cachetools>=3.1.0,<5',

[beam] branch master updated: Add hypothesis strategy to generate complex schemas, use it to test `schemas` module (#23361)

Reply via email to