This is an automated email from the ASF dual-hosted git repository.
pabloem pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git
The following commit(s) were added to refs/heads/master by this push:
new 958a9302ebb Add hypothesis strategy to generate complex schemas, use
it to test `schemas` module (#23361)
958a9302ebb is described below
commit 958a9302ebb8231054a6d76da9438b8e33a3d474
Author: Brian Hulette <[email protected]>
AuthorDate: Thu Dec 15 09:35:45 2022 -0800
Add hypothesis strategy to generate complex schemas, use it to test
`schemas` module (#23361)
* Add hypothesis
* Test schemas module with hypothesis-generated complex schemas
* Fix schema id handling for nested schemas
* removing duplicated dependency
Co-authored-by: Pablo <[email protected]>
---
.gitignore | 1 +
sdks/python/apache_beam/typehints/row_type.py | 17 +++-
sdks/python/apache_beam/typehints/schemas.py | 1 +
sdks/python/apache_beam/typehints/schemas_test.py | 17 ++++
.../apache_beam/typehints/testing/__init__.py | 18 ++++
.../apache_beam/typehints/testing/strategies.py | 102 +++++++++++++++++++++
sdks/python/setup.py | 2 +-
7 files changed, 156 insertions(+), 2 deletions(-)
diff --git a/.gitignore b/.gitignore
index f5fc63f9de8..73c9e05b4ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,6 +45,7 @@
sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/sampl
**/env/**/*
**/.mypy_cache
**/.dmypy.json
+**/.hypothesis
sdks/python/**/*.c
sdks/python/**/*.so
sdks/python/**/*.egg
diff --git a/sdks/python/apache_beam/typehints/row_type.py b/sdks/python/apache_beam/typehints/row_type.py
index b02877e4daa..fd7885ad59c 100644
--- a/sdks/python/apache_beam/typehints/row_type.py
+++ b/sdks/python/apache_beam/typehints/row_type.py
@@ -34,6 +34,15 @@ from apache_beam.typehints.schema_registry import SchemaTypeRegistry
_BEAM_SCHEMA_ID = "_beam_schema_id"
+def _user_type_is_generated(user_type: type) -> bool:
+ if not hasattr(user_type, _BEAM_SCHEMA_ID):
+ return False
+
+ schema_id = getattr(user_type, _BEAM_SCHEMA_ID)
+ type_name = 'BeamSchema_{}'.format(schema_id.replace('-', '_'))
+ return user_type.__name__ == type_name
+
+
class RowTypeConstraint(typehints.TypeConstraint):
def __init__(
self,
@@ -98,6 +107,13 @@ class RowTypeConstraint(typehints.TypeConstraint):
fields = [(name, user_type.__annotations__[name])
for name in user_type._fields]
+ if _user_type_is_generated(user_type):
+ return RowTypeConstraint.from_fields(
+ fields,
+ schema_id=getattr(user_type, _BEAM_SCHEMA_ID),
+ schema_options=schema_options,
+ field_options=field_options)
+
# TODO(https://github.com/apache/beam/issues/22125): Add user API for
# specifying schema/field options
return RowTypeConstraint(
@@ -203,7 +219,6 @@ class GeneratedClassRowTypeConstraint(RowTypeConstraint):
field_options=field_options,
**kwargs)
user_type = named_tuple_from_schema(schema, **kwargs)
- setattr(user_type, _BEAM_SCHEMA_ID, schema_id)
super().__init__(
fields,
diff --git a/sdks/python/apache_beam/typehints/schemas.py b/sdks/python/apache_beam/typehints/schemas.py
index 55335bbd6b6..dd917a205b4 100644
--- a/sdks/python/apache_beam/typehints/schemas.py
+++ b/sdks/python/apache_beam/typehints/schemas.py
@@ -522,6 +522,7 @@ class SchemaTranslation(object):
user_type,
'__reduce__',
_named_tuple_reduce_method(schema.SerializeToString()))
+ setattr(user_type, row_type._BEAM_SCHEMA_ID, schema.id)
self.schema_registry.add(user_type, schema)
coders.registry.register_coder(user_type, coders.RowCoder)
diff --git a/sdks/python/apache_beam/typehints/schemas_test.py b/sdks/python/apache_beam/typehints/schemas_test.py
index de2ed829ab3..2495f5722ad 100644
--- a/sdks/python/apache_beam/typehints/schemas_test.py
+++ b/sdks/python/apache_beam/typehints/schemas_test.py
@@ -32,6 +32,8 @@ from typing import Sequence
import cloudpickle
import dill
import numpy as np
+from hypothesis import given
+from hypothesis import settings
from parameterized import parameterized
from parameterized import parameterized_class
@@ -44,6 +46,7 @@ from apache_beam.typehints.schemas import named_tuple_from_schema
from apache_beam.typehints.schemas import named_tuple_to_schema
from apache_beam.typehints.schemas import typing_from_runner_api
from apache_beam.typehints.schemas import typing_to_runner_api
+from apache_beam.typehints.testing.strategies import named_fields
from apache_beam.utils.timestamp import Timestamp
all_nonoptional_primitives = [
@@ -640,6 +643,20 @@ class SchemaTest(unittest.TestCase):
self.assertEqual(instance, (np.int64(35), 'baz'))
+class HypothesisTest(unittest.TestCase):
+ # There is considerable variability in runtime for this test, disable
+ # deadline.
+ @settings(deadline=None)
+ @given(named_fields())
+ def test_named_fields_roundtrip(self, named_fields):
+ typehint = row_type.RowTypeConstraint.from_fields(named_fields)
+ roundtripped = typing_from_runner_api(
+ typing_to_runner_api(typehint, schema_registry=SchemaTypeRegistry()),
+ schema_registry=SchemaTypeRegistry())
+
+ self.assertEqual(typehint, roundtripped)
+
+
@parameterized_class([
{
'pickler': pickle,
diff --git a/sdks/python/apache_beam/typehints/testing/__init__.py b/sdks/python/apache_beam/typehints/testing/__init__.py
new file mode 100644
index 00000000000..20f2809b550
--- /dev/null
+++ b/sdks/python/apache_beam/typehints/testing/__init__.py
@@ -0,0 +1,18 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Testing utilities for apache_beam.typehints."""
diff --git a/sdks/python/apache_beam/typehints/testing/strategies.py b/sdks/python/apache_beam/typehints/testing/strategies.py
new file mode 100644
index 00000000000..a388e25eea2
--- /dev/null
+++ b/sdks/python/apache_beam/typehints/testing/strategies.py
@@ -0,0 +1,102 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""hypothesis strategies for generating schema types.
+
+Intended for internal use only, no backward-compatibility guarantees."""
+
+import keyword
+import unicodedata
+from typing import Mapping
+from typing import Optional
+from typing import Sequence
+
+from hypothesis import strategies as st
+
+from apache_beam.typehints import row_type
+from apache_beam.typehints.schemas import _PRIMITIVES
+
+PRIMITIVES = [p[0] for p in _PRIMITIVES]
+
+
+def field_names():
+ @st.composite
+ def field_name_candidates(draw):
+ """Strategy to produce valid field names for Beam schema types."""
+ # unicode categories that cannot be used in Python identifiers
+ identifer_denylist = (
+ 'Lo', 'Lm', 'C', 'P', 'Sm', 'Sc', 'So', 'Sk', 'M', 'No', 'Z')
+
+ # First character can't be numeric (Nd).
+ # It also can't start with '_' in a NamedTuple.
+ field_first_character = draw(
+ st.text(
+ alphabet=st.characters(
+ blacklist_categories=('Nd', ) + identifer_denylist,
+ blacklist_characters=('_')),
+ min_size=1,
+ max_size=1))
+ field_remainder = draw(
+ st.text(
+ alphabet=st.characters(blacklist_categories=identifer_denylist)))
+
+ return field_first_character + field_remainder
+
+ return field_name_candidates().filter(
+ lambda s: s.isidentifier() and not keyword.iskeyword(s))
+
+
+def _named_fields_from_types(types):
+ return st.lists(
+ st.tuples(field_names(), types),
+ min_size=1,
+ # Python identifiers are normalized with form NFKC (see
+ # https://peps.python.org/pep-0672/#normalizing-identifiers). We use the
+ # same normalization here to avoid name collisions.
+ unique_by=lambda name_and_type: unicodedata.normalize(
+ 'NFKC', name_and_type[0]),
+ )
+
+
+def types():
+ """Strategy to produce types that are convertible to Beam schema FieldType
+ instances."""
+ def _extend_types(types):
+ optionals = types.map(lambda typ: Optional[typ])
+ sequences = types.map(lambda typ: Sequence[typ])
+ mappings = st.tuples(types,
+ types).map(lambda typs: Mapping[typs[0], typs[1]])
+ rows = _named_fields_from_types(types).map(
+ row_type.RowTypeConstraint.from_fields)
+
+ return st.one_of(optionals, sequences, mappings, rows)
+
+ # TODO: Currently this will only draw from the primitive types that can be
+ # roundtripped faithfully (e.g. np.int64, not int). We should add support for
+ # other types:
+ # - Logical Types (e.g. Timestamp)
+ # - Shunted primitive types (e.g. int)
+ # We'll need to provide support for limiting the types that are drawn. This
+ # could be similar to the allowlist[_categories]/denylist[_categories] pattern
+ # used in st.characters.
+ return st.recursive(st.sampled_from(PRIMITIVES), _extend_types)
+
+
+def named_fields():
+ """Strategy to produce a set of named fields (type ``List[Tuple[str,
+ type]]``)."""
+ return _named_fields_from_types(types())
diff --git a/sdks/python/setup.py b/sdks/python/setup.py
index 4e7c0483cc1..f55b66f4005 100644
--- a/sdks/python/setup.py
+++ b/sdks/python/setup.py
@@ -272,7 +272,6 @@ if __name__ == '__main__':
],
'test': [
'freezegun>=0.3.12',
- 'hypothesis<7',
'joblib>=1.0.1',
'mock>=1.0.1,<3.0.0',
'pandas<2.0.0',
@@ -289,6 +288,7 @@ if __name__ == '__main__':
'psycopg2-binary>=2.8.5,<3.0.0',
'testcontainers[mysql]>=3.0.3,<4.0.0',
'cryptography>=36.0.0',
+ 'hypothesis>5.0.0,<=7.0.0',
],
'gcp': [
'cachetools>=3.1.0,<5',