[avro] branch master updated: AVRO-1938: Generate Parsing Canonical Forms of Schema (#1167)

kojiromike Mon, 05 Apr 2021 06:42:08 -0700

This is an automated email from the ASF dual-hosted git repository.

kojiromike pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/avro.git



The following commit(s) were added to refs/heads/master by this push:
     new c769c43  AVRO-1938: Generate Parsing Canonical Forms of Schema (#1167)
c769c43 is described below

commit c769c4343e24d2473417afe46b37a1b60d7ca84f
Author: Subhash Bhushan <[email protected]>
AuthorDate: Mon Apr 5 06:41:47 2021 -0700

    AVRO-1938: Generate Parsing Canonical Forms of Schema (#1167)
    
    * AVRO-1938: Generate Parsing Canonical Forms of Schema
    
    This PR adds support for generating Parsing Canonical Forms of Avro Schemas
    to the main `avro` package.
    
    The bulk of the work was done by @kojiromike and @forsberg. This PR cleans
    up code where necessary, adds more test cases, and clarifies which
    transformations are not applicable in Python (e.g., transformation of
    integers with leading zeros).
    
    Closes: https://issues.apache.org/jira/browse/AVRO-1938
    
    * AVRO-1938 Fix typo of repeating test method name
    
    * AVRO-1938: Makes `names` argument to `to_canonical_json` optional
    
    This commit ensures that `names` is an optional parameter in all subclass
    implementations of `to_canonical_json`. It also addresses minor formatting
    issues and review comments to previous commits.
    
    * AVRO-1938 Use `dict.get` to return value in dict or default
---
 lang/py/avro/protocol.py         |   6 +-
 lang/py/avro/schema.py           | 197 ++++++++++++++----
 lang/py/avro/test/test_schema.py | 419 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 580 insertions(+), 42 deletions(-)

diff --git a/lang/py/avro/protocol.py b/lang/py/avro/protocol.py
index 483f63e..afe6d63 100644
--- a/lang/py/avro/protocol.py
+++ b/lang/py/avro/protocol.py
@@ -209,13 +209,14 @@ class Message:
         return json.dumps(self.to_json())
 
     def to_json(self, names=None):
-        if names is None:
-            names = avro.schema.Names()
+        names = names or avro.schema.Names()
+
         to_dump = {}
         to_dump['request'] = self.request.to_json(names)
         to_dump['response'] = self.response.to_json(names)
         if self.errors:
             to_dump['errors'] = self.errors.to_json(names)
+
         return to_dump
 
     def __eq__(self, that):
@@ -231,6 +232,7 @@ def make_avpr_object(json_data):
         messages = json_data.get('messages')
     except AttributeError:
         raise avro.errors.ProtocolParseException('Not a JSON object: %s' % 
json_data)
+
     return Protocol(name, namespace, types, messages)
 
 
diff --git a/lang/py/avro/schema.py b/lang/py/avro/schema.py
index 2795268..e4021fa 100644
--- a/lang/py/avro/schema.py
+++ b/lang/py/avro/schema.py
@@ -41,6 +41,7 @@ A schema may be one of:
 """
 
 import abc
+import collections
 import datetime
 import decimal
 import json
@@ -113,6 +114,16 @@ VALID_FIELD_SORT_ORDERS = (
     'ignore',
 )
 
+CANONICAL_FIELD_ORDER = (
+    'name',
+    'type',
+    'fields',
+    'symbols',
+    'items',
+    'values',
+    'size',
+)
+
 INT_MIN_VALUE = -(1 << 31)
 INT_MAX_VALUE = (1 << 31) - 1
 LONG_MIN_VALUE = -(1 << 63)
@@ -136,7 +147,18 @@ def _is_timezone_aware_datetime(dt):
 # Base Classes
 #
 
-class Schema(abc.ABC):
+class CanonicalPropertiesMixin(object):
+    """A Mixin that provides canonical properties to Schema and Field types."""
+    @property
+    def canonical_properties(self):
+        props = self.props
+        return collections.OrderedDict(
+            (key, props[key])
+            for key in CANONICAL_FIELD_ORDER
+            if key in props)
+
+
+class Schema(abc.ABC, CanonicalPropertiesMixin):
     """Base class for all Schema classes."""
     _props = None
 
@@ -202,9 +224,12 @@ class Schema(abc.ABC):
         in the parameter names.
         """
 
+    @abc.abstractmethod
     def validate(self, datum):
         """Returns the appropriate schema object if datum is valid for that 
schema, else None.
 
+        To be implemented in subclasses.
+
         Validation concerns only shape and type of data in the top level of 
the current schema.
         In most cases, the returned schema object will be self. However, for 
UnionSchema objects,
         the returned Schema will be the first branch schema for which 
validation passes.
@@ -212,7 +237,20 @@ class Schema(abc.ABC):
         @arg datum: The data to be checked for validity according to this 
schema
         @return Optional[Schema]
         """
-        raise Exception("Must be implemented by subclasses.")
+
+    @abc.abstractmethod
+    def to_canonical_json(self, names=None):
+        """
+        Converts the schema object into its Canonical Form
+        
http://avro.apache.org/docs/current/spec.html#Parsing+Canonical+Form+for+Schemas
+
+        To be implemented in subclasses.
+        """
+
+    @property
+    def canonical_form(self):
+        # The separators eliminate whitespace around commas and colons.
+        return json.dumps(self.to_canonical_json(), separators=(",", ":"))
 
 
 class Name:
@@ -223,22 +261,24 @@ class Name:
     def __init__(self, name_attr, space_attr, default_space):
         """The fullname is determined in one of the following ways:
 
-        - A name and namespace are both specified. For example, one might use 
"name": "X", "namespace": "org.foo" to indicate the fullname org.foo.X.
+        - A name and namespace are both specified. For example, one might use 
"name": "X",
+            "namespace": "org.foo" to indicate the fullname org.foo.X.
         - A fullname is specified. If the name specified contains a dot,
-          then it is assumed to be a fullname, and any namespace also 
specified is ignored.
-          For example, use "name": "org.foo.X" to indicate the fullname 
org.foo.X.
+            then it is assumed to be a fullname, and any namespace also 
specified is ignored.
+            For example, use "name": "org.foo.X" to indicate the fullname 
org.foo.X.
         - A name only is specified, i.e., a name that contains no dots.
-          In this case the namespace is taken from the most tightly enclosing 
schema or protocol.
-          For example, if "name": "X" is specified, and this occurs within a 
field of
-          the record definition of org.foo.Y, then the fullname is org.foo.X.
-          If there is no enclosing namespace then the null namespace is used.
+            In this case the namespace is taken from the most tightly 
enclosing schema or protocol.
+            For example, if "name": "X" is specified, and this occurs within a 
field of
+            the record definition of org.foo.Y, then the fullname is org.foo.X.
+            If there is no enclosing namespace then the null namespace is used.
 
         References to previously defined names are as in the latter two cases 
above:
         if they contain a dot they are a fullname,
         if they do not contain a dot, the namespace is the namespace of the 
enclosing definition.
 
         @arg name_attr: name value read in schema or None.
-        @arg space_attr: namespace value read in schema or None. The empty 
string may be used as a namespace to indicate the null namespace.
+        @arg space_attr: namespace value read in schema or None. The empty 
string may be used as a namespace
+            to indicate the null namespace.
         @arg default_space: the current default space or None.
         """
         if name_attr is None:
@@ -291,9 +331,7 @@ class Names:
 
     def get_name(self, name_attr, space_attr):
         test = Name(name_attr, space_attr, self.default_namespace).fullname
-        if test not in self.names:
-            return None
-        return self.names[test]
+        return self.names.get(test)
 
     def prune_namespace(self, properties):
         """given a properties, return properties with namespace removed if
@@ -301,12 +339,15 @@ class Names:
         if self.default_namespace is None:
             # I have no default -- no change
             return properties
+
         if 'namespace' not in properties:
             # he has no namespace - no change
             return properties
+
         if properties['namespace'] != self.default_namespace:
             # we're different - leave his stuff alone
             return properties
+
         # we each have a namespace and it's redundant. delete his.
         prunable = properties.copy()
         del(prunable['namespace'])
@@ -316,10 +357,10 @@ class Names:
         """
         Add a new schema object to the name set.
 
-          @arg name_attr: name value read in schema
-          @arg space_attr: namespace value read in schema.
+        @arg name_attr: name value read in schema
+        @arg space_attr: namespace value read in schema.
 
-          @return: the Name that was just added.
+        @return: the Name that was just added.
         """
         to_add = Name(name_attr, space_attr, self.default_namespace)
 
@@ -407,7 +448,7 @@ class DecimalLogicalSchema(LogicalSchema):
         super(DecimalLogicalSchema, self).__init__('decimal')
 
 
-class Field:
+class Field(CanonicalPropertiesMixin):
     def __init__(self, type, name, has_default, default=None,
                  order=None, names=None, doc=None, other_props=None):
         # Ensure valid ctor args
@@ -469,10 +510,19 @@ class Field:
         return json.dumps(self.to_json())
 
     def to_json(self, names=None):
-        if names is None:
-            names = Names()
+        names = names or Names()
+
         to_dump = self.props.copy()
         to_dump['type'] = self.type.to_json(names)
+
+        return to_dump
+
+    def to_canonical_json(self, names=None):
+        names = names or Names()
+
+        to_dump = self.canonical_properties
+        to_dump["type"] = self.type.to_canonical_json(names)
+
         return to_dump
 
     def __eq__(self, that):
@@ -526,6 +576,9 @@ class PrimitiveSchema(Schema):
         else:
             return self.props
 
+    def to_canonical_json(self, names=None):
+        return self.fullname if len(self.props) == 1 else 
self.canonical_properties
+
     def validate(self, datum):
         """Return self if datum is a valid representation of this type of 
primitive schema, else None
 
@@ -593,13 +646,19 @@ class FixedSchema(NamedSchema):
         return self.type == writer.type and self.check_props(writer, 
['fullname', 'size'])
 
     def to_json(self, names=None):
-        if names is None:
-            names = Names()
+        names = names or Names()
+
         if self.fullname in names.names:
             return self.name_ref(names)
-        else:
-            names.names[self.fullname] = self
-            return names.prune_namespace(self.props)
+
+        names.names[self.fullname] = self
+        return names.prune_namespace(self.props)
+
+    def to_canonical_json(self, names=None):
+        to_dump = self.canonical_properties
+        to_dump["name"] = self.fullname
+
+        return to_dump
 
     def validate(self, datum):
         """Return self if datum is a valid representation of this schema, else 
None."""
@@ -673,13 +732,24 @@ class EnumSchema(NamedSchema):
         return self.type == writer.type and self.check_props(writer, 
['fullname'])
 
     def to_json(self, names=None):
-        if names is None:
-            names = Names()
+        names = names or Names()
+
         if self.fullname in names.names:
             return self.name_ref(names)
+
+        names.names[self.fullname] = self
+        return names.prune_namespace(self.props)
+
+    def to_canonical_json(self, names=None):
+        names_as_json = self.to_json(names)
+
+        if isinstance(names_as_json, str):
+            to_dump = self.fullname
         else:
-            names.names[self.fullname] = self
-            return names.prune_namespace(self.props)
+            to_dump = self.canonical_properties
+            to_dump["name"] = self.fullname
+
+        return to_dump
 
     def validate(self, datum):
         """Return self if datum is a valid member of this Enum, else None."""
@@ -722,11 +792,21 @@ class ArraySchema(Schema):
         return self.type == writer.type and 
self.items.check_props(writer.items, ['type'])
 
     def to_json(self, names=None):
-        if names is None:
-            names = Names()
+        names = names or Names()
+
         to_dump = self.props.copy()
         item_schema = self.get_prop('items')
         to_dump['items'] = item_schema.to_json(names)
+
+        return to_dump
+
+    def to_canonical_json(self, names=None):
+        names = names or Names()
+
+        to_dump = self.canonical_properties
+        item_schema = self.get_prop("items")
+        to_dump["items"] = item_schema.to_canonical_json(names)
+
         return to_dump
 
     def validate(self, datum):
@@ -768,10 +848,19 @@ class MapSchema(Schema):
         return writer.type == self.type and 
self.values.check_props(writer.values, ['type'])
 
     def to_json(self, names=None):
-        if names is None:
-            names = Names()
+        names = names or Names()
+
         to_dump = self.props.copy()
         to_dump['values'] = self.get_prop('values').to_json(names)
+
+        return to_dump
+
+    def to_canonical_json(self, names=None):
+        names = names or Names()
+
+        to_dump = self.canonical_properties
+        to_dump["values"] = self.get_prop("values").to_canonical_json(names)
+
         return to_dump
 
     def validate(self, datum):
@@ -829,13 +918,19 @@ class UnionSchema(Schema):
         return writer.type in {'union', 'error_union'} or any(s.match(writer) 
for s in self.schemas)
 
     def to_json(self, names=None):
-        if names is None:
-            names = Names()
+        names = names or Names()
+
         to_dump = []
         for schema in self.schemas:
             to_dump.append(schema.to_json(names))
+
         return to_dump
 
+    def to_canonical_json(self, names=None):
+        names = names or Names()
+
+        return [schema.to_canonical_json(names) for schema in self.schemas]
+
     def validate(self, datum):
         """Return the first branch schema of which datum is a valid example, 
else None."""
         for branch in self.schemas:
@@ -853,14 +948,15 @@ class ErrorUnionSchema(UnionSchema):
         UnionSchema.__init__(self, ['string'] + schemas, names)
 
     def to_json(self, names=None):
-        if names is None:
-            names = Names()
+        names = names or Names()
+
         to_dump = []
         for schema in self.schemas:
             # Don't print the system error schema
             if schema.type == 'string':
                 continue
             to_dump.append(schema.to_json(names))
+
         return to_dump
 
 
@@ -948,8 +1044,8 @@ class RecordSchema(NamedSchema):
         return fields_dict
 
     def to_json(self, names=None):
-        if names is None:
-            names = Names()
+        names = names or Names()
+
         # Request records don't have names
         if self.type == 'request':
             return [f.to_json(names) for f in self.fields]
@@ -961,6 +1057,24 @@ class RecordSchema(NamedSchema):
 
         to_dump = names.prune_namespace(self.props.copy())
         to_dump['fields'] = [f.to_json(names) for f in self.fields]
+
+        return to_dump
+
+    def to_canonical_json(self, names=None):
+        names = names or Names()
+
+        if self.type == 'request':
+            raise NotImplementedError("Canonical form (probably) does not make 
sense on type request")
+
+        to_dump = self.canonical_properties
+        to_dump["name"] = self.fullname
+
+        if self.fullname in names.names:
+            return self.name_ref(names)
+
+        names.names[self.fullname] = self
+        to_dump["fields"] = [f.to_canonical_json(names) for f in self.fields]
+
         return to_dump
 
     def validate(self, datum):
@@ -1124,18 +1238,19 @@ def make_avsc_object(json_data, names=None, 
validate_enum_symbols=True):
     @arg names: A Names object (tracks seen names and default space)
     @arg validate_enum_symbols: If False, will allow enum symbols that are not 
valid Avro names.
     """
-    if names is None:
-        names = Names()
+    names = names or Names()
 
     # JSON object (non-union)
     if callable(getattr(json_data, 'get', None)):
         type = json_data.get('type')
         other_props = get_other_props(json_data, SCHEMA_RESERVED_PROPS)
         logical_type = json_data.get('logicalType')
+
         if logical_type:
             logical_schema = make_logical_schema(logical_type, type, 
other_props or {})
             if logical_schema is not None:
                 return logical_schema
+
         if type in NAMED_TYPES:
             name = json_data.get('name')
             namespace = json_data.get('namespace', names.default_namespace)
@@ -1159,8 +1274,10 @@ def make_avsc_object(json_data, names=None, 
validate_enum_symbols=True):
                 return RecordSchema(name, namespace, fields, names, type, doc, 
other_props)
             else:
                 raise avro.errors.SchemaParseException('Unknown Named Type: 
%s' % type)
+
         if type in PRIMITIVE_TYPES:
             return PrimitiveSchema(type, other_props)
+
         if type in VALID_TYPES:
             if type == 'array':
                 items = json_data.get('items')
diff --git a/lang/py/avro/test/test_schema.py b/lang/py/avro/test/test_schema.py
index 7ce5a1c..fceb973 100644
--- a/lang/py/avro/test/test_schema.py
+++ b/lang/py/avro/test/test_schema.py
@@ -105,6 +105,28 @@ UNION_EXAMPLES = [
                        {"type": "array", "items": "string"}]),
 ]
 
+NAMED_IN_UNION_EXAMPLES = [
+  ValidTestSchema({
+    "namespace": "org.apache.avro.test",
+    "type": "record",
+    "name": "Test",
+    "fields": [
+        {
+            "type": {
+                "symbols": ["one", "two"],
+                "type": "enum",
+                "name": "NamedEnum"
+                },
+            "name": "thenamedenum"
+        },
+        {
+            "type": ["null", "NamedEnum"],
+            "name": "unionwithreftoenum"
+        }
+    ]
+    })
+]
+
 RECORD_EXAMPLES = [
     ValidTestSchema({"type": "record", "name": "Test", "fields": [{"name": 
"f", "type": "long"}]}),
     ValidTestSchema({"type": "error", "name": "Test", "fields": [{"name": "f", 
"type": "long"}]}),
@@ -295,6 +317,7 @@ EXAMPLES += ENUM_EXAMPLES
 EXAMPLES += ARRAY_EXAMPLES
 EXAMPLES += MAP_EXAMPLES
 EXAMPLES += UNION_EXAMPLES
+EXAMPLES += NAMED_IN_UNION_EXAMPLES
 EXAMPLES += RECORD_EXAMPLES
 EXAMPLES += DOC_EXAMPLES
 EXAMPLES += DECIMAL_LOGICAL_TYPE
@@ -583,6 +606,401 @@ class OtherAttributesTestCase(unittest.TestCase):
                 self._check_props(p)
 
 
+class CanonicalFormTestCase(unittest.TestCase):
+    r"""Enable generating canonical-form test cases over the valid schema.
+        Transforming into Parsing Canonical Form
+        Assuming an input schema (in JSON form) that's already UTF-8 text for 
a valid Avro schema (including all
+        quotes as required by JSON), the following transformations will 
produce its Parsing Canonical Form:
+            - [PRIMITIVES] Convert primitive schemas to their simple form 
(e.g., int instead of {"type":"int"}).
+            - [FULLNAMES] Replace short names with fullnames, using applicable 
namespaces to do so. Then eliminate
+                namespace attributes, which are now redundant.
+            - [STRIP] Keep only attributes that are relevant to parsing data, 
which are: type, name, fields, symbols,
+                items, values, size. Strip all others (e.g., doc and aliases).
+            - [ORDER] Order the appearance of fields of JSON objects as 
follows: name, type, fields, symbols, items,
+                values, size. For example, if an object has type, name, and 
size fields, then the name field should
+                appear first, followed by the type and then the size fields.
+            - [STRINGS] For all JSON string literals in the schema text, 
replace any escaped characters
+                (e.g., \uXXXX escapes) with their UTF-8 equivalents.
+            - [INTEGERS] Eliminate quotes around and any leading zeros in 
front of JSON integer literals
+                (which appear in the size attributes of fixed schemas).
+            - [WHITESPACE] Eliminate all whitespace in JSON outside of string 
literals.
+        We depend on the Python json parser to properly handle the STRINGS and 
INTEGERS rules, so
+        we don't test them here.
+    """
+
+    def compact_json_string(self, json_doc):
+        """Returns compact-encoded JSON string representation for supplied 
document.
+
+        Args:
+            json_doc (json): JSON Document
+
+        Returns:
+            str: Compact-encoded, stringified JSON document
+        """
+        return json.dumps(json_doc, separators=(',', ':'))
+
+    def test_primitive_int(self):
+        """
+        Convert primitive schemas to their simple form (e.g., int instead of 
{"type":"int"}).
+        """
+        s = avro.schema.parse(json.dumps('int'))
+        self.assertEqual(s.canonical_form, '"int"')
+
+        s = avro.schema.parse(json.dumps({"type": "int"}))
+        self.assertEqual(s.canonical_form, '"int"')
+
+    def test_primitive_float(self):
+        s = avro.schema.parse(json.dumps('float'))
+        self.assertEqual(s.canonical_form, '"float"')
+
+        s = avro.schema.parse(json.dumps({"type": "float"}))
+        self.assertEqual(s.canonical_form, '"float"')
+
+    def test_primitive_double(self):
+        s = avro.schema.parse(json.dumps('double'))
+        self.assertEqual(s.canonical_form, '"double"')
+
+        s = avro.schema.parse(json.dumps({"type": "double"}))
+        self.assertEqual(s.canonical_form, '"double"')
+
+    def test_primitive_null(self):
+        s = avro.schema.parse(json.dumps('null'))
+        self.assertEqual(s.canonical_form, '"null"')
+
+        s = avro.schema.parse(json.dumps({"type": "null"}))
+        self.assertEqual(s.canonical_form, '"null"')
+
+    def test_primitive_bytes(self):
+        s = avro.schema.parse(json.dumps('bytes'))
+        self.assertEqual(s.canonical_form, '"bytes"')
+
+        s = avro.schema.parse(json.dumps({"type": "bytes"}))
+        self.assertEqual(s.canonical_form, '"bytes"')
+
+    def test_primitive_long(self):
+        s = avro.schema.parse(json.dumps('long'))
+        self.assertEqual(s.canonical_form, '"long"')
+
+        s = avro.schema.parse(json.dumps({"type": "long"}))
+        self.assertEqual(s.canonical_form, '"long"')
+
+    def test_primitive_boolean(self):
+        s = avro.schema.parse(json.dumps('boolean'))
+        self.assertEqual(s.canonical_form, '"boolean"')
+
+        s = avro.schema.parse(json.dumps({"type": "boolean"}))
+        self.assertEqual(s.canonical_form, '"boolean"')
+
+    def test_primitive_string(self):
+        s = avro.schema.parse(json.dumps('string'))
+        self.assertEqual(s.canonical_form, '"string"')
+
+        s = avro.schema.parse(json.dumps({"type": "string"}))
+        self.assertEqual(s.canonical_form, '"string"')
+
+    def test_integer_canonical_form(self):
+        """
+        Integer literals starting with 0 are illegal in python, because of 
ambiguity. This is a placeholder test
+        for INTEGERS canonical form, which should generally succeed provided a 
valid integer has been supplied.
+        """
+        s = avro.schema.parse('{"name":"md5","type":"fixed","size":16}')
+        self.assertEqual(
+            s.canonical_form,
+            self.compact_json_string({
+                "name": "md5",
+                "type": "fixed",
+                "size": 16}))
+
+    def test_string_with_escaped_characters(self):
+        """
+        Replace any escaped characters (e.g., \u0031 escapes) with their UTF-8 
equivalents.
+        """
+        s = avro.schema.parse('{"name":"\u0041","type":"fixed","size":16}')
+        self.assertEqual(
+            s.canonical_form,
+            self.compact_json_string({
+                "name": "A",
+                "type": "fixed",
+                "size": 16}))
+
+    def test_fullname(self):
+        """
+        Replace short names with fullnames, using applicable namespaces to do 
so. Then eliminate namespace attributes, which are now redundant.
+        """
+        s = avro.schema.parse(json.dumps({
+            "namespace": "avro",
+            "name": "example",
+            "type": "enum",
+            "symbols": ["a", "b"]}))
+        self.assertEqual(
+            s.canonical_form,
+            self.compact_json_string({
+                "name": "avro.example",
+                "type": "enum",
+                "symbols": ["a", "b"]}))
+
+    def test_strip(self):
+        """
+        Keep only attributes that are relevant to parsing data, which are: 
type, name, fields, symbols, items, values,
+        size. Strip all others (e.g., doc and aliases).
+        """
+        s = avro.schema.parse(json.dumps({
+            "name": "foo",
+            "type": "enum",
+            "doc": "test",
+            "aliases": ["bar"],
+            "symbols": ["a", "b"]}))
+        self.assertEqual(
+            s.canonical_form,
+            self.compact_json_string({
+                "name": "foo",
+                "type": "enum",
+                "symbols": ["a", "b"]}))
+
+    def test_order(self):
+        """
+        Order the appearance of fields of JSON objects as follows: name, type, 
fields, symbols, items, values, size.
+        For example, if an object has type, name, and size fields, then the 
name field should appear first, followed
+        by the type and then the size fields.
+        """
+        s = avro.schema.parse(json.dumps({
+            "symbols": ["a", "b"],
+            "type": "enum",
+            "name": "example"}))
+        self.assertEqual(
+            s.canonical_form,
+            self.compact_json_string({
+                "name": "example",
+                "type": "enum",
+                "symbols": ["a", "b"]}))
+
+    def test_whitespace(self):
+        """
+        Eliminate all whitespace in JSON outside of string literals.
+        """
+        s = avro.schema.parse(
+            '''{"type": "fixed",
+            "size": 16,
+            "name": "md5"}
+                ''')
+        self.assertEqual(
+            s.canonical_form,
+            self.compact_json_string({
+                "name": "md5",
+                "type": "fixed",
+                "size": 16}))
+
+    def test_record_field(self):
+        """
+        Ensure that record fields produce the correct parsing canonical form.
+        """
+        s = avro.schema.parse(json.dumps({
+            "type": "record",
+            "name": "Test",
+            "doc": "This is a test schema",
+            "aliases": ["also", "known", "as"],
+            "fields": [
+                {
+                    "type": {
+                        "symbols": ["one", "two"],
+                        "type": "enum",
+                        "name": "NamedEnum"},
+                    "name": "thenamedenum",
+                    "doc": "This is a named enum"
+                },
+                {
+                    "type": ["null", "NamedEnum"],
+                    "name": "unionwithreftoenum"
+                }
+            ]
+        }))
+        expected = self.compact_json_string({
+            "name": "Test",
+            "type": "record",
+            "fields": [
+                {
+                    "name": "thenamedenum",
+                    "type": {
+                        "name": "NamedEnum",
+                        "type": "enum",
+                        "symbols": ["one", "two"]
+                    }
+                },
+                {
+                    "name": "unionwithreftoenum",
+                    "type": ["null", "NamedEnum"]
+                }
+            ]
+        })
+        self.assertEqual(s.canonical_form, expected)
+
+    def test_array(self):
+        """
+        Ensure that array schema produce the correct parsing canonical form.
+        """
+        s = avro.schema.parse(json.dumps({
+            "items": "long",
+            "type": "array"}))
+        self.assertEqual(
+            s.canonical_form,
+            self.compact_json_string({
+                "type": "array",
+                "items": "long"}))
+
+    def test_map(self):
+        """
+        Ensure that map schema produce the correct parsing canonical form.
+        """
+        s = avro.schema.parse(json.dumps({
+            "values": "long",
+            "type": "map"}))
+        self.assertEqual(
+            s.canonical_form,
+            self.compact_json_string({
+                "type": "map",
+                "values": "long"}))
+
+    def test_union(self):
+        """
+        Ensure that a union schema produces the correct parsing canonical form.
+        """
+        s = avro.schema.parse(json.dumps(["string", "null", "long"]))
+        self.assertEqual(
+            s.canonical_form,
+            '["string","null","long"]')
+
+    def test_large_record_handshake_request(self):
+        s = avro.schema.parse("""
+            {
+            "type": "record",
+            "name": "HandshakeRequest",
+            "namespace": "org.apache.avro.ipc",
+            "fields": [
+                {
+                "name": "clientHash",
+                "type": {"type": "fixed", "name": "MD5", "size": 16}
+                },
+                {"name": "clientProtocol", "type": ["null", "string"]},
+                {"name": "serverHash", "type": "MD5"},
+                {
+                "name": "meta",
+                "type": ["null", {"type": "map", "values": "bytes"}]
+                }
+            ]
+            }
+            """)
+        self.assertEqual(
+            s.canonical_form,
+            ('{"name":"org.apache.avro.ipc.HandshakeRequest","type":"record",'
+             
'"fields":[{"name":"clientHash","type":{"name":"org.apache.avro.ipc.MD5",'
+             
'"type":"fixed","size":16}},{"name":"clientProtocol","type":["null","string"]},'
+             
'{"name":"serverHash","type":{"name":"org.apache.avro.ipc.MD5","type":"fixed","size":16}},'
+             
'{"name":"meta","type":["null",{"type":"map","values":"bytes"}]}]}'))
+
+    def test_large_record_handshake_response(self):
+        s = avro.schema.parse("""
+            {
+            "type": "record",
+            "name": "HandshakeResponse",
+            "namespace": "org.apache.avro.ipc",
+            "fields": [
+                {
+                "name": "match",
+                "type": {
+                    "type": "enum",
+                    "name": "HandshakeMatch",
+                    "symbols": ["BOTH", "CLIENT", "NONE"]
+                }
+                },
+                {"name": "serverProtocol", "type": ["null", "string"]},
+                {
+                "name": "serverHash",
+                "type": ["null", {"name": "MD5", "size": 16, "type": "fixed"}]
+                },
+                {
+                "name": "meta",
+                "type": ["null", {"type": "map", "values": "bytes"}]}]
+                }
+            """)
+        self.assertEqual(
+            s.canonical_form,
+            ('{"name":"org.apache.avro.ipc.HandshakeResponse","type":"rec'
+             'ord","fields":[{"name":"match","type":{"name":"org.apache.a'
+             'vro.ipc.HandshakeMatch","type":"enum","symbols":["BOTH","CL'
+             'IENT","NONE"]}},{"name":"serverProtocol","type":["null","st'
+             'ring"]},{"name":"serverHash","type":["null",{"name":"org.ap'
+             'ache.avro.ipc.MD5","type":"fixed","size":16}]},{"name":"met'
+             'a","type":["null",{"type":"map","values":"bytes"}]}]}'))
+
+    def test_large_record_interop(self):
+        s = avro.schema.parse("""
+            {
+            "type": "record",
+            "name": "Interop",
+            "namespace": "org.apache.avro",
+            "fields": [
+                {"name": "intField", "type": "int"},
+                {"name": "longField", "type": "long"},
+                {"name": "stringField", "type": "string"},
+                {"name": "boolField", "type": "boolean"},
+                {"name": "floatField", "type": "float"},
+                {"name": "doubleField", "type": "double"},
+                {"name": "bytesField", "type": "bytes"},
+                {"name": "nullField", "type": "null"},
+                {"name": "arrayField", "type": {"type": "array", "items": 
"double"}},
+                {
+                "name": "mapField",
+                "type": {
+                    "type": "map",
+                    "values": {"name": "Foo",
+                            "type": "record",
+                            "fields": [{"name": "label", "type": "string"}]}
+                }
+                },
+                {
+                "name": "unionField",
+                "type": ["boolean", "double", {"type": "array", "items": 
"bytes"}]
+                },
+                {
+                "name": "enumField",
+                "type": {"type": "enum", "name": "Kind", "symbols": ["A", "B", 
"C"]}
+                },
+                {
+                "name": "fixedField",
+                "type": {"type": "fixed", "name": "MD5", "size": 16}
+                },
+                {
+                "name": "recordField",
+                "type": {"type": "record",
+                        "name": "Node",
+                        "fields": [{"name": "label", "type": "string"},
+                                    {"name": "children",
+                                    "type": {"type": "array",
+                                                "items": "Node"}}]}
+                }
+            ]
+            }
+            """)
+        self.assertEqual(
+            s.canonical_form,
+            ('{"name":"org.apache.avro.Interop","type":"record","fields":[{"na'
+             'me":"intField","type":"int"},{"name":"longField","type":"long"},'
+             '{"name":"stringField","type":"string"},{"name":"boolField","type'
+             '":"boolean"},{"name":"floatField","type":"float"},{"name":"doubl'
+             'eField","type":"double"},{"name":"bytesField","type":"bytes"},{"'
+             'name":"nullField","type":"null"},{"name":"arrayField","type":{"t'
+             'ype":"array","items":"double"}},{"name":"mapField","type":{"type'
+             '":"map","values":{"name":"org.apache.avro.Foo","type":"record","'
+             'fields":[{"name":"label","type":"string"}]}}},{"name":"unionFiel'
+             'd","type":["boolean","double",{"type":"array","items":"bytes"}]}'
+             ',{"name":"enumField","type":{"name":"org.apache.avro.Kind","type'
+             '":"enum","symbols":["A","B","C"]}},{"name":"fixedField","type":{'
+             '"name":"org.apache.avro.MD5","type":"fixed","size":16}},{"name":'
+             '"recordField","type":{"name":"org.apache.avro.Node","type":"reco'
+             'rd","fields":[{"name":"label","type":"string"},{"name":"children'
+             '","type":{"type":"array","items":"org.apache.avro.Node"}}]}}]}'))
+
+
 def load_tests(loader, default_tests, pattern):
     """Generate test cases across many test schema."""
     suite = unittest.TestSuite()
@@ -591,6 +1009,7 @@ def load_tests(loader, default_tests, pattern):
     suite.addTests(RoundTripParseTestCase(ex) for ex in VALID_EXAMPLES)
     suite.addTests(DocAttributesTestCase(ex) for ex in DOC_EXAMPLES)
     suite.addTests(OtherAttributesTestCase(ex) for ex in OTHER_PROP_EXAMPLES)
+    suite.addTests(loader.loadTestsFromTestCase(CanonicalFormTestCase))
     return suite

[avro] branch master updated: AVRO-1938: Generate Parsing Canonical Forms of Schema (#1167)

Reply via email to