This is an automated email from the ASF dual-hosted git repository.

mgrigorov pushed a commit to branch branch-1.11
in repository https://gitbox.apache.org/repos/asf/avro.git


The following commit(s) were added to refs/heads/branch-1.11 by this push:
     new 467466de7 AVRO-1938: Add fingerprinting support to Python 
implementation (#1181)
467466de7 is described below

commit 467466de77efdb03aa6df2924573b5c29306fd80
Author: Subhash Bhushan <[email protected]>
AuthorDate: Mon Jul 17 13:39:19 2023 -0700

    AVRO-1938: Add fingerprinting support to Python implementation (#1181)
    
    * AVRO-1938 Add support for fingerprinting schemas
    
    With this change, Schema fingerprints can be extracted by
    invoking the `fingerprint` method on the schema object. By default,
    fingerprints will be generated with the CRC-64-AVRO algorithm. Optionally,
    a different algorithm can be supplied.
    
    All algorithms guaranteed by hashlib are available, but Avro
    recommends using one among CRC-64-AVRO, MD5, and SHA256 as per needs.
    
    * AVRO-1938 Fix issue with AbstractSet typecheck
    
    * Format with black
    
    * Freeze Supported Algorithms Set
    
    This commit addresses review comments and freezes the supported
    fingerprinting algorithms set.
    
    * Minor lint fix with black
    
    * Address Typecheck issues with Frozenset
    
    * Fold Fingerprint Mixin within Schema
    
    Addresses PR 1181 review comments. Methods within Fingerprint mixin
    have been made available at the module level, including static
    variables used in fingerprinting. This PR has been synced with latest
    master.
    
    * Add type hints to fingerprint methods/variables
    
    * Fix incorrect import sorting in schema.py to pass lint check
    
    * Address @kojiromike Jul 16 review comments
    
    * Address @kojiromike Jul 16 review comments - 2
    
    * Address @kojiromike Jul 17 review comments
    
    * Fix black lint issue
    
    (cherry picked from commit f504265943d929bcf0ba3ed133de511eb601efd0)
---
 lang/py/avro/errors.py           |   4 +
 lang/py/avro/schema.py           |  81 ++++++++++++-
 lang/py/avro/test/test_schema.py | 244 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 328 insertions(+), 1 deletion(-)

diff --git a/lang/py/avro/errors.py b/lang/py/avro/errors.py
index 2c7675131..b961a04ae 100644
--- a/lang/py/avro/errors.py
+++ b/lang/py/avro/errors.py
@@ -120,3 +120,7 @@ class UsageError(RuntimeError, AvroException):
 
 class AvroRuntimeException(RuntimeError, AvroException):
     """Raised when compatibility parsing encounters an unknown type"""
+
+
+class UnknownFingerprintAlgorithmException(AvroException):
+    """Raised when attempting to generate a fingerprint with an unknown 
algorithm"""
diff --git a/lang/py/avro/schema.py b/lang/py/avro/schema.py
index 8a64cb315..3efe1cf48 100644
--- a/lang/py/avro/schema.py
+++ b/lang/py/avro/schema.py
@@ -42,12 +42,23 @@ import abc
 import collections
 import datetime
 import decimal
+import hashlib
 import json
 import math
 import uuid
 import warnings
+from functools import reduce
 from pathlib import Path
-from typing import List, Mapping, MutableMapping, Optional, Sequence, Union, 
cast
+from typing import (
+    FrozenSet,
+    List,
+    Mapping,
+    MutableMapping,
+    Optional,
+    Sequence,
+    Union,
+    cast,
+)
 
 import avro.constants
 import avro.errors
@@ -104,6 +115,50 @@ def _is_timezone_aware_datetime(dt: datetime.datetime) -> 
bool:
     return dt.tzinfo is not None and dt.tzinfo.utcoffset(dt) is not None
 
 
+# Fingerprint Constants
+_EMPTY64_FINGERPRINT: int = 0xC15D213AA4D7A795
+_FINGERPRINT_TABLE: tuple = tuple(reduce(lambda fp, _: (fp >> 1) ^ 
(_EMPTY64_FINGERPRINT & -(fp & 1)), range(8), i) for i in range(256))
+
+
+# All algorithms guaranteed by hashlib are supported:
+#     - 'blake2b',
+#     - 'blake2s',
+#     - 'md5',
+#     - 'sha1',
+#     - 'sha224',
+#     - 'sha256',
+#     - 'sha384',
+#     - 'sha3_224',
+#     - 'sha3_256',
+#     - 'sha3_384',
+#     - 'sha3_512',
+#     - 'sha512',
+#     - 'shake_128',
+#     - 'shake_256'
+SUPPORTED_ALGORITHMS: FrozenSet[str] = frozenset({"CRC-64-AVRO"} | 
hashlib.algorithms_guaranteed)
+
+
+def _crc_64_fingerprint(data: bytes) -> bytes:
+    """The 64-bit Rabin Fingerprint.
+
+    As described in the Avro specification.
+
+    Args:
+        data: A bytes object containing the UTF-8 encoded parsing canonical
+        form of an Avro schema.
+    Returns:
+        A bytes object with a length of eight in little-endian format.
+    """
+    result = _EMPTY64_FINGERPRINT
+
+    for b in data:
+        result = (result >> 8) ^ _FINGERPRINT_TABLE[(result ^ b) & 0xFF]
+
+    # Although not mentioned in the Avro specification, the Java
+    # implementation gives fingerprint bytes in little-endian order
+    return result.to_bytes(length=8, byteorder="little", signed=False)
+
+
 #
 # Base Classes
 #
@@ -240,6 +295,30 @@ class Schema(abc.ABC, CanonicalPropertiesMixin):
         Consider the mixins EqualByPropsMixin and EqualByJsonMixin
         """
 
+    def fingerprint(self, algorithm="CRC-64-AVRO") -> bytes:
+        """
+        Generate fingerprint for supplied algorithm.
+
+        'CRC-64-AVRO' will be used as the algorithm by default, but any
+        algorithm supported by hashlib (as can be referenced with
+        `hashlib.algorithms_guaranteed`) can be specified.
+
+        `algorithm` param is used as an algorithm name, and 
UnknownFingerprintAlgorithmException
+        will be raised if the algorithm is not among the supported ones.
+        """
+        schema = self.canonical_form.encode("utf-8")
+
+        if algorithm == "CRC-64-AVRO":
+            return _crc_64_fingerprint(schema)
+
+        if algorithm not in SUPPORTED_ALGORITHMS:
+            raise avro.errors.UnknownFingerprintAlgorithmException(f"Unknown 
Fingerprint Algorithm: {algorithm}")
+
+        # Generate digests with hashlib for all other algorithms
+        # Lowercase algorithm to support algorithm strings sent by other 
languages like Java
+        h = hashlib.new(algorithm.lower(), schema)
+        return h.digest()
+
 
 class NamedSchema(Schema):
     """Named Schemas specified in NAMED_TYPES."""
diff --git a/lang/py/avro/test/test_schema.py b/lang/py/avro/test/test_schema.py
index c59ded8a7..668ca8258 100644
--- a/lang/py/avro/test/test_schema.py
+++ b/lang/py/avro/test/test_schema.py
@@ -519,6 +519,204 @@ IGNORED_LOGICAL_TYPE = [
     ),
 ]
 
+
+# Fingerprint examples are in the form of tuples:
+# - Value in Position 0 is schema
+# - Value in Position 1 is an array of fingerprints:
+#     - Position 0 is CRC-64-AVRO fingerprint
+#     - Position 1 is MD5 fingerprint
+#     - Position 2 is SHA256 fingerprint
+FINGERPRINT_EXAMPLES = [
+    ('"int"', ["8f5c393f1ad57572", "ef524ea1b91e73173d938ade36c1db32", 
"3f2b87a9fe7cc9b13835598c3981cd45e3e355309e5090aa0933d7becb6fba45"]),
+    ('{"type": "int"}', ["8f5c393f1ad57572", 
"ef524ea1b91e73173d938ade36c1db32", 
"3f2b87a9fe7cc9b13835598c3981cd45e3e355309e5090aa0933d7becb6fba45"]),
+    ('"float"', ["90d7a83ecb027c4d", "50a6b9db85da367a6d2df400a41758a6", 
"1e71f9ec051d663f56b0d8e1fc84d71aa56ccfe9fa93aa20d10547a7abeb5cc0"]),
+    (
+        '{"type": "float"}',
+        ["90d7a83ecb027c4d", "50a6b9db85da367a6d2df400a41758a6", 
"1e71f9ec051d663f56b0d8e1fc84d71aa56ccfe9fa93aa20d10547a7abeb5cc0"],
+    ),
+    ('"long"', ["b71df49344e154d0", "e1dd9a1ef98b451b53690370b393966b", 
"c32c497df6730c97fa07362aa5023f37d49a027ec452360778114cf427965add"]),
+    (
+        '{"type": "long"}',
+        ["b71df49344e154d0", "e1dd9a1ef98b451b53690370b393966b", 
"c32c497df6730c97fa07362aa5023f37d49a027ec452360778114cf427965add"],
+    ),
+    ('"double"', ["7e95ab32c035758e", "bfc71a62f38b99d6a93690deeb4b3af6", 
"730a9a8c611681d7eef442e03c16c70d13bca3eb8b977bb403eaff52176af254"]),
+    (
+        '{"type": "double"}',
+        ["7e95ab32c035758e", "bfc71a62f38b99d6a93690deeb4b3af6", 
"730a9a8c611681d7eef442e03c16c70d13bca3eb8b977bb403eaff52176af254"],
+    ),
+    ('"bytes"', ["651920c3da16c04f", "b462f06cb909be57c85008867784cde6", 
"9ae507a9dd39ee5b7c7e285da2c0846521c8ae8d80feeae5504e0c981d53f5fa"]),
+    (
+        '{"type": "bytes"}',
+        ["651920c3da16c04f", "b462f06cb909be57c85008867784cde6", 
"9ae507a9dd39ee5b7c7e285da2c0846521c8ae8d80feeae5504e0c981d53f5fa"],
+    ),
+    ('"string"', ["c70345637248018f", "095d71cf12556b9d5e330ad575b3df5d", 
"e9e5c1c9e4f6277339d1bcde0733a59bd42f8731f449da6dc13010a916930d48"]),
+    (
+        '{"type": "string"}',
+        ["c70345637248018f", "095d71cf12556b9d5e330ad575b3df5d", 
"e9e5c1c9e4f6277339d1bcde0733a59bd42f8731f449da6dc13010a916930d48"],
+    ),
+    ('"boolean"', ["64f7d4a478fc429f", "01f692b30d4a1c8a3e600b1440637f8f", 
"a5b031ab62bc416d720c0410d802ea46b910c4fbe85c50a946ccc658b74e677e"]),
+    (
+        '{"type": "boolean"}',
+        ["64f7d4a478fc429f", "01f692b30d4a1c8a3e600b1440637f8f", 
"a5b031ab62bc416d720c0410d802ea46b910c4fbe85c50a946ccc658b74e677e"],
+    ),
+    ('"null"', ["8a8f25cce724dd63", "9b41ef67651c18488a8b08bb67c75699", 
"f072cbec3bf8841871d4284230c5e983dc211a56837aed862487148f947d1a1f"]),
+    (
+        '{"type": "null"}',
+        ["8a8f25cce724dd63", "9b41ef67651c18488a8b08bb67c75699", 
"f072cbec3bf8841871d4284230c5e983dc211a56837aed862487148f947d1a1f"],
+    ),
+    (
+        '{"type": "fixed", "name": "Test", "size": 1}',
+        ["6869897b4049355b", "db01bc515fcfcd2d4be82ed385288261", 
"f527116a6f44455697e935afc31dc60ad0f95caf35e1d9c9db62edb3ffeb9170"],
+    ),
+    (
+        json.dumps({"type": "fixed", "name": "MyFixed", "namespace": 
"org.apache.hadoop.avro", "size": 1}),
+        ["fadbd138e85bdf45", "d74b3726484422711c465d49e857b1ba", 
"28e493a44771cecc5deca4bd938cdc3d5a24cfe1f3760bc938fa1057df6334fc"],
+    ),
+    (
+        '{"type": "enum", "name": "Test", "symbols": ["A", "B"]}',
+        ["03a2f2c2e27f7a16", "d883f2a9b16ed085fcc5e4ca6c8f6ed1", 
"9b51286144f87ce5aebdc61ca834379effa5a41ce6ac0938630ff246297caca8"],
+    ),
+    (
+        '{"type": "array", "items": "long"}',
+        ["715e2ea28bc91654", "c1c387e8d6a58f0df749b698991b1f43", 
"f78e954167feb23dcb1ce01e8463cebf3408e0a4259e16f24bd38f6d0f1d578b"],
+    ),
+    (
+        json.dumps({"type": "array", "items": {"type": "enum", "name": "Test", 
"symbols": ["A", "B"]}}),
+        ["10d9ade1fa3a0387", "cfc7b861c7cfef082a6ef082948893fa", 
"0d8edd49d7f7e9553668f133577bc99f842852b55d9f84f1f7511e4961aa685c"],
+    ),
+    (
+        '{"type": "map", "values": "long"}',
+        ["6f74f4e409b1334e", "32b3f1a3177a0e73017920f00448b56e", 
"b8fad07d458971a07692206b8a7cf626c86c62fe6bcff7c1b11bc7295de34853"],
+    ),
+    (
+        json.dumps({"type": "map", "values": {"type": "enum", "name": "Test", 
"symbols": ["A", "B"]}}),
+        ["df2ab0626f6b812d", "c588da6ba99701c41e73fd30d23f994e", 
"3886747ed1669a8af476b549e97b34222afb2fed5f18bb27c6f367ea0351a576"],
+    ),
+    (
+        '["string", "null", "long"]',
+        ["65a5be410d687566", "b11cf95f0a55dd55f9ee515a37bf937a", 
"ed8d254116441bb35e237ad0563cf5432b8c975334bd222c1ee84609435d95bb"],
+    ),
+    (
+        json.dumps({"type": "record", "name": "Test", "fields": [{"name": "f", 
"type": "long"}]}),
+        ["ed94e5f5e6eb588e", "69531a03db788afe353244cd049b1e6d", 
"9670f15a8f96d23e92830d00b8bd57275e02e3e173ffef7c253c170b6beabeb8"],
+    ),
+    (
+        json.dumps(
+            {
+                "type": "record",
+                "name": "Node",
+                "fields": [{"name": "label", "type": "string"}, {"name": 
"children", "type": {"type": "array", "items": "Node"}}],
+            }
+        ),
+        ["52cba544c3e756b7", "99625b0cc02050363e89ef66b0f406c9", 
"65d80dc8c95c98a9671d92cf0415edfabfee2cb058df2138606656cd6ae4dc59"],
+    ),
+    (
+        json.dumps(
+            {
+                "type": "record",
+                "name": "Lisp",
+                "fields": [
+                    {
+                        "name": "value",
+                        "type": [
+                            "null",
+                            "string",
+                            {"type": "record", "name": "Cons", "fields": 
[{"name": "car", "type": "Lisp"}, {"name": "cdr", "type": "Lisp"}]},
+                        ],
+                    }
+                ],
+            }
+        ),
+        ["68d91a23eda0b306", "9e1d0d15b52789fcb8e3a88b53059d5f", 
"e5ce4f4a15ce19fa1047cfe16a3b0e13a755db40f00f23284fdd376fc1c7dd21"],
+    ),
+    (
+        json.dumps(
+            {
+                "type": "record",
+                "name": "HandshakeRequest",
+                "namespace": "org.apache.avro.ipc",
+                "fields": [
+                    {"name": "clientHash", "type": {"type": "fixed", "name": 
"MD5", "size": 16}},
+                    {"name": "clientProtocol", "type": ["null", "string"]},
+                    {"name": "serverHash", "type": "MD5"},
+                    {"name": "meta", "type": ["null", {"type": "map", 
"values": "bytes"}]},
+                ],
+            }
+        ),
+        ["43818703b7b5d769", "16ded8b5027e80a17704c6565c0c3f1b", 
"6c317314687da52a85c813a7f0c92298a60b79625b9acc072e4d9e4256a1d800"],
+    ),
+    (
+        json.dumps(
+            {
+                "type": "record",
+                "name": "HandshakeResponse",
+                "namespace": "org.apache.avro.ipc",
+                "fields": [
+                    {"name": "match", "type": {"type": "enum", "name": 
"HandshakeMatch", "symbols": ["BOTH", "CLIENT", "NONE"]}},
+                    {"name": "serverProtocol", "type": ["null", "string"]},
+                    {"name": "serverHash", "type": ["null", {"name": "MD5", 
"size": 16, "type": "fixed"}]},
+                    {"name": "meta", "type": ["null", {"type": "map", 
"values": "bytes"}]},
+                ],
+            }
+        ),
+        ["00feee01de4ea50e", "afe529d01132daab7f4e2a6663e7a2f5", 
"a303cbbfe13958f880605d70c521a4b7be34d9265ac5a848f25916a67b11d889"],
+    ),
+    (
+        json.dumps(
+            {
+                "type": "record",
+                "name": "Interop",
+                "namespace": "org.apache.avro",
+                "fields": [
+                    {"name": "intField", "type": "int"},
+                    {"name": "longField", "type": "long"},
+                    {"name": "stringField", "type": "string"},
+                    {"name": "boolField", "type": "boolean"},
+                    {"name": "floatField", "type": "float"},
+                    {"name": "doubleField", "type": "double"},
+                    {"name": "bytesField", "type": "bytes"},
+                    {"name": "nullField", "type": "null"},
+                    {"name": "arrayField", "type": {"type": "array", "items": 
"double"}},
+                    {
+                        "name": "mapField",
+                        "type": {"type": "map", "values": {"name": "Foo", 
"type": "record", "fields": [{"name": "label", "type": "string"}]}},
+                    },
+                    {"name": "unionField", "type": ["boolean", "double", 
{"type": "array", "items": "bytes"}]},
+                    {"name": "enumField", "type": {"type": "enum", "name": 
"Kind", "symbols": ["A", "B", "C"]}},
+                    {"name": "fixedField", "type": {"type": "fixed", "name": 
"MD5", "size": 16}},
+                    {
+                        "name": "recordField",
+                        "type": {
+                            "type": "record",
+                            "name": "Node",
+                            "fields": [{"name": "label", "type": "string"}, 
{"name": "children", "type": {"type": "array", "items": "Node"}}],
+                        },
+                    },
+                ],
+            }
+        ),
+        ["e82c0a93a6a0b5a4", "994fea1a1be7ff8603cbe40c3bc7e4ca", 
"cccfd6e3f917cf53b0f90c206342e6703b0d905071f724a1c1f85b731c74058d"],
+    ),
+    (
+        json.dumps(
+            {
+                "type": "record",
+                "name": "ipAddr",
+                "fields": [{"name": "addr", "type": [{"name": "IPv6", "type": 
"fixed", "size": 16}, {"name": "IPv4", "type": "fixed", "size": 4}]}],
+            }
+        ),
+        ["8d961b4e298a1844", "45d85c69b353a99b93d7c4f2fcf0c30d", 
"6f6fc8f685a4f07d99734946565d63108806d55a8620febea047cf52cb0ac181"],
+    ),
+    (
+        json.dumps({"type": "record", "name": "TestDoc", "doc": "Doc string", 
"fields": [{"name": "name", "type": "string", "doc": "Doc String"}]}),
+        ["0e6660f02bcdc109", "f2da75f5131f5ab80629538287b8beb2", 
"0b3644f7aa5ca2fc4bad93ca2d3609c12aa9dbda9c15e68b34c120beff08e7b9"],
+    ),
+    (
+        '{"type": "enum", "name": "Test", "symbols": ["A", "B"], "doc": "Doc 
String"}',
+        ["03a2f2c2e27f7a16", "d883f2a9b16ed085fcc5e4ca6c8f6ed1", 
"9b51286144f87ce5aebdc61ca834379effa5a41ce6ac0938630ff246297caca8"],
+    ),
+]
+
 EXAMPLES = PRIMITIVE_EXAMPLES
 EXAMPLES += FIXED_EXAMPLES
 EXAMPLES += ENUM_EXAMPLES
@@ -634,13 +832,25 @@ class TestMisc(unittest.TestCase):
     def test_parse_invalid_symbol(self):
         """Disabling enumschema symbol validation should allow invalid symbols 
to pass."""
         test_schema_string = json.dumps({"type": "enum", "name": "AVRO2174", 
"symbols": ["white space"]})
+
         with self.assertRaises(avro.errors.InvalidName, msg="When enum symbol 
validation is enabled, an invalid symbol should raise InvalidName."):
             avro.schema.parse(test_schema_string, validate_enum_symbols=True)
+
         try:
             avro.schema.parse(test_schema_string, validate_enum_symbols=False)
         except avro.errors.InvalidName:  # pragma: no coverage
             self.fail("When enum symbol validation is disabled, an invalid 
symbol should not raise InvalidName.")
 
+    def test_unsupported_fingerprint_algorithm(self):
+        s = avro.schema.parse('"int"')
+        self.assertRaises(avro.errors.UnknownFingerprintAlgorithmException, 
s.fingerprint, "foo")
+
+    def test_less_popular_fingerprint_algorithm(self):
+        s = avro.schema.parse('"int"')
+        fingerprint = s.fingerprint("sha384")
+        hex_fingerprint = "".join(format(b, "02x") for b in 
fingerprint).zfill(16)
+        self.assertEqual(hex_fingerprint, 
"32ed5e4ac896570f044d1dab68f4c8ca9866ac06d22261f399316bf4799e16854750238085775107dfac905c82b2feaf")
+
 
 class SchemaParseTestCase(unittest.TestCase):
     """Enable generating parse test cases over all the valid and invalid 
example schema."""
@@ -1181,6 +1391,39 @@ class CanonicalFormTestCase(unittest.TestCase):
         )
 
 
+class FingerprintTestCase(unittest.TestCase):
+    """
+    Enable generating fingerprint test cases across algorithms.
+
+    Fingerprint examples are in the form of tuples:
+        - Value in Position 0 is schema
+        - Value in Position 1 is an array of fingerprints:
+            - Position 0 is CRC-64-AVRO fingerprint
+            - Position 1 is MD5 fingerprint
+            - Position 2 is SHA256 fingerprint
+    """
+
+    def __init__(self, test_schema, fingerprints):
+        """Ignore the normal signature for unittest.TestCase because we are 
generating
+        many test cases from this one class. This is safe as long as the 
autoloader
+        ignores this class. The autoloader will ignore this class as long as 
it has
+        no methods starting with `test_`.
+        """
+        super(FingerprintTestCase, self).__init__("validate_fingerprint")
+        self.test_schema = test_schema
+        self.fingerprints = fingerprints
+
+    def _hex_fingerprint(self, fingerprint):
+        return "".join(format(b, "02x") for b in fingerprint).zfill(16)
+
+    def validate_fingerprint(self):
+        """The CRC-64-AVRO, MD5 and SHA256 fingerprints of a Schema should match the expected values."""
+        s = avro.schema.parse(self.test_schema)
+        self.assertEqual(self._hex_fingerprint(s.fingerprint()), 
self.fingerprints[0])
+        self.assertEqual(self._hex_fingerprint(s.fingerprint("md5")), 
self.fingerprints[1])
+        self.assertEqual(self._hex_fingerprint(s.fingerprint("sha256")), 
self.fingerprints[2])
+
+
 def load_tests(loader, default_tests, pattern):
     """Generate test cases across many test schema."""
     suite = unittest.TestSuite()
@@ -1190,6 +1433,7 @@ def load_tests(loader, default_tests, pattern):
     suite.addTests(DocAttributesTestCase(ex) for ex in DOC_EXAMPLES)
     suite.addTests(OtherAttributesTestCase(ex) for ex in OTHER_PROP_EXAMPLES)
     suite.addTests(loader.loadTestsFromTestCase(CanonicalFormTestCase))
+    suite.addTests(FingerprintTestCase(ex[0], ex[1]) for ex in 
FINGERPRINT_EXAMPLES)
     return suite
 
 

Reply via email to