This is an automated email from the ASF dual-hosted git repository.
mgrigorov pushed a commit to branch branch-1.11
in repository https://gitbox.apache.org/repos/asf/avro.git
The following commit(s) were added to refs/heads/branch-1.11 by this push:
new 467466de7 AVRO-1938: Add fingerprinting support to Python
implementation (#1181)
467466de7 is described below
commit 467466de77efdb03aa6df2924573b5c29306fd80
Author: Subhash Bhushan <[email protected]>
AuthorDate: Mon Jul 17 13:39:19 2023 -0700
AVRO-1938: Add fingerprinting support to Python implementation (#1181)
* AVRO-1938 Add support for fingerprinting schemas
With this change, Schema fingerprints can be extracted by
invoking the `fingerprint` method on the schema object. By default,
fingerprints will be generated with the CRC-64-AVRO algorithm. Optionally,
the algorithm can be supplied.
All algorithms supported by hashlib are available, but Avro
recommends using one of CRC-64-AVRO, MD5, or SHA-256, depending on needs.
* AVRO-1938 Fix issue with AbstractSet typecheck
* Format with black
* Freeze Supported Algorithms Set
This commit addresses review comments and freezes the supported
fingerprinting algorithms set.
* Minor lint fix with black
* Address Typecheck issues with Frozenset
* Fold Fingerprint Mixin within Schema
Addresses PR 1181 review comments. Methods within Fingerprint mixin
have been made available at the module level, including static
variables used in fingerprinting. This PR has been synced with latest
master.
* Add type hints to fingerprint methods/variables
* Fix incorrect import sorting in schema.py to pass lint check
* Address @kojiromike Jul 16 review comments
* Address @kojiromike Jul 16 review comments - 2
* Address @kojiromike Jul 17 review comments
* Fix black lint issue
(cherry picked from commit f504265943d929bcf0ba3ed133de511eb601efd0)
---
lang/py/avro/errors.py | 4 +
lang/py/avro/schema.py | 81 ++++++++++++-
lang/py/avro/test/test_schema.py | 244 +++++++++++++++++++++++++++++++++++++++
3 files changed, 328 insertions(+), 1 deletion(-)
diff --git a/lang/py/avro/errors.py b/lang/py/avro/errors.py
index 2c7675131..b961a04ae 100644
--- a/lang/py/avro/errors.py
+++ b/lang/py/avro/errors.py
@@ -120,3 +120,7 @@ class UsageError(RuntimeError, AvroException):
class AvroRuntimeException(RuntimeError, AvroException):
"""Raised when compatibility parsing encounters an unknown type"""
+
+
+class UnknownFingerprintAlgorithmException(AvroException):
+ """Raised when attempting to generate a fingerprint with an unknown
algorithm"""
diff --git a/lang/py/avro/schema.py b/lang/py/avro/schema.py
index 8a64cb315..3efe1cf48 100644
--- a/lang/py/avro/schema.py
+++ b/lang/py/avro/schema.py
@@ -42,12 +42,23 @@ import abc
import collections
import datetime
import decimal
+import hashlib
import json
import math
import uuid
import warnings
+from functools import reduce
from pathlib import Path
-from typing import List, Mapping, MutableMapping, Optional, Sequence, Union,
cast
+from typing import (
+ FrozenSet,
+ List,
+ Mapping,
+ MutableMapping,
+ Optional,
+ Sequence,
+ Union,
+ cast,
+)
import avro.constants
import avro.errors
@@ -104,6 +115,50 @@ def _is_timezone_aware_datetime(dt: datetime.datetime) ->
bool:
return dt.tzinfo is not None and dt.tzinfo.utcoffset(dt) is not None
+# Fingerprint Constants
+_EMPTY64_FINGERPRINT: int = 0xC15D213AA4D7A795
+_FINGERPRINT_TABLE: tuple = tuple(reduce(lambda fp, _: (fp >> 1) ^
(_EMPTY64_FINGERPRINT & -(fp & 1)), range(8), i) for i in range(256))
+
+
+# All algorithms guaranteed by hashlib are supported:
+# - 'blake2b',
+# - 'blake2s',
+# - 'md5',
+# - 'sha1',
+# - 'sha224',
+# - 'sha256',
+# - 'sha384',
+# - 'sha3_224',
+# - 'sha3_256',
+# - 'sha3_384',
+# - 'sha3_512',
+# - 'sha512',
+# - 'shake_128',
+# - 'shake_256'
+SUPPORTED_ALGORITHMS: FrozenSet[str] = frozenset({"CRC-64-AVRO"} |
hashlib.algorithms_guaranteed)
+
+
+def _crc_64_fingerprint(data: bytes) -> bytes:
+ """The 64-bit Rabin Fingerprint.
+
+ As described in the Avro specification.
+
+ Args:
+ data: A bytes object containing the UTF-8 encoded parsing canonical
+ form of an Avro schema.
+ Returns:
+ A bytes object with a length of eight in little-endian format.
+ """
+ result = _EMPTY64_FINGERPRINT
+
+ for b in data:
+ result = (result >> 8) ^ _FINGERPRINT_TABLE[(result ^ b) & 0xFF]
+
+ # Although not mentioned in the Avro specification, the Java
+ # implementation gives fingerprint bytes in little-endian order
+ return result.to_bytes(length=8, byteorder="little", signed=False)
+
+
#
# Base Classes
#
@@ -240,6 +295,30 @@ class Schema(abc.ABC, CanonicalPropertiesMixin):
Consider the mixins EqualByPropsMixin and EqualByJsonMixin
"""
+ def fingerprint(self, algorithm="CRC-64-AVRO") -> bytes:
+ """
+ Generate fingerprint for supplied algorithm.
+
+ 'CRC-64-AVRO' will be used as the algorithm by default, but any
+ algorithm supported by hashlib (as can be referenced with
+ `hashlib.algorithms_guaranteed`) can be specified.
+
+ `algorithm` param is used as an algorithm name, and
UnknownFingerprintAlgorithmException
+ will be raised if the algorithm is not among those supported.
+ """
+ schema = self.canonical_form.encode("utf-8")
+
+ if algorithm == "CRC-64-AVRO":
+ return _crc_64_fingerprint(schema)
+
+ if algorithm not in SUPPORTED_ALGORITHMS:
+ raise avro.errors.UnknownFingerprintAlgorithmException(f"Unknown
Fingerprint Algorithm: {algorithm}")
+
+ # Generate digests with hashlib for all other algorithms
+ # Lowercase algorithm to support algorithm strings sent by other
languages like Java
+ h = hashlib.new(algorithm.lower(), schema)
+ return h.digest()
+
class NamedSchema(Schema):
"""Named Schemas specified in NAMED_TYPES."""
diff --git a/lang/py/avro/test/test_schema.py b/lang/py/avro/test/test_schema.py
index c59ded8a7..668ca8258 100644
--- a/lang/py/avro/test/test_schema.py
+++ b/lang/py/avro/test/test_schema.py
@@ -519,6 +519,204 @@ IGNORED_LOGICAL_TYPE = [
),
]
+
+# Fingerprint examples are in the form of tuples:
+# - Value in Position 0 is schema
+# - Value in Position 1 is an array of fingerprints:
+# - Position 0 is CRC-64-AVRO fingerprint
+# - Position 1 is MD5 fingerprint
+# - Position 2 is SHA256 fingerprint
+FINGERPRINT_EXAMPLES = [
+ ('"int"', ["8f5c393f1ad57572", "ef524ea1b91e73173d938ade36c1db32",
"3f2b87a9fe7cc9b13835598c3981cd45e3e355309e5090aa0933d7becb6fba45"]),
+ ('{"type": "int"}', ["8f5c393f1ad57572",
"ef524ea1b91e73173d938ade36c1db32",
"3f2b87a9fe7cc9b13835598c3981cd45e3e355309e5090aa0933d7becb6fba45"]),
+ ('"float"', ["90d7a83ecb027c4d", "50a6b9db85da367a6d2df400a41758a6",
"1e71f9ec051d663f56b0d8e1fc84d71aa56ccfe9fa93aa20d10547a7abeb5cc0"]),
+ (
+ '{"type": "float"}',
+ ["90d7a83ecb027c4d", "50a6b9db85da367a6d2df400a41758a6",
"1e71f9ec051d663f56b0d8e1fc84d71aa56ccfe9fa93aa20d10547a7abeb5cc0"],
+ ),
+ ('"long"', ["b71df49344e154d0", "e1dd9a1ef98b451b53690370b393966b",
"c32c497df6730c97fa07362aa5023f37d49a027ec452360778114cf427965add"]),
+ (
+ '{"type": "long"}',
+ ["b71df49344e154d0", "e1dd9a1ef98b451b53690370b393966b",
"c32c497df6730c97fa07362aa5023f37d49a027ec452360778114cf427965add"],
+ ),
+ ('"double"', ["7e95ab32c035758e", "bfc71a62f38b99d6a93690deeb4b3af6",
"730a9a8c611681d7eef442e03c16c70d13bca3eb8b977bb403eaff52176af254"]),
+ (
+ '{"type": "double"}',
+ ["7e95ab32c035758e", "bfc71a62f38b99d6a93690deeb4b3af6",
"730a9a8c611681d7eef442e03c16c70d13bca3eb8b977bb403eaff52176af254"],
+ ),
+ ('"bytes"', ["651920c3da16c04f", "b462f06cb909be57c85008867784cde6",
"9ae507a9dd39ee5b7c7e285da2c0846521c8ae8d80feeae5504e0c981d53f5fa"]),
+ (
+ '{"type": "bytes"}',
+ ["651920c3da16c04f", "b462f06cb909be57c85008867784cde6",
"9ae507a9dd39ee5b7c7e285da2c0846521c8ae8d80feeae5504e0c981d53f5fa"],
+ ),
+ ('"string"', ["c70345637248018f", "095d71cf12556b9d5e330ad575b3df5d",
"e9e5c1c9e4f6277339d1bcde0733a59bd42f8731f449da6dc13010a916930d48"]),
+ (
+ '{"type": "string"}',
+ ["c70345637248018f", "095d71cf12556b9d5e330ad575b3df5d",
"e9e5c1c9e4f6277339d1bcde0733a59bd42f8731f449da6dc13010a916930d48"],
+ ),
+ ('"boolean"', ["64f7d4a478fc429f", "01f692b30d4a1c8a3e600b1440637f8f",
"a5b031ab62bc416d720c0410d802ea46b910c4fbe85c50a946ccc658b74e677e"]),
+ (
+ '{"type": "boolean"}',
+ ["64f7d4a478fc429f", "01f692b30d4a1c8a3e600b1440637f8f",
"a5b031ab62bc416d720c0410d802ea46b910c4fbe85c50a946ccc658b74e677e"],
+ ),
+ ('"null"', ["8a8f25cce724dd63", "9b41ef67651c18488a8b08bb67c75699",
"f072cbec3bf8841871d4284230c5e983dc211a56837aed862487148f947d1a1f"]),
+ (
+ '{"type": "null"}',
+ ["8a8f25cce724dd63", "9b41ef67651c18488a8b08bb67c75699",
"f072cbec3bf8841871d4284230c5e983dc211a56837aed862487148f947d1a1f"],
+ ),
+ (
+ '{"type": "fixed", "name": "Test", "size": 1}',
+ ["6869897b4049355b", "db01bc515fcfcd2d4be82ed385288261",
"f527116a6f44455697e935afc31dc60ad0f95caf35e1d9c9db62edb3ffeb9170"],
+ ),
+ (
+ json.dumps({"type": "fixed", "name": "MyFixed", "namespace":
"org.apache.hadoop.avro", "size": 1}),
+ ["fadbd138e85bdf45", "d74b3726484422711c465d49e857b1ba",
"28e493a44771cecc5deca4bd938cdc3d5a24cfe1f3760bc938fa1057df6334fc"],
+ ),
+ (
+ '{"type": "enum", "name": "Test", "symbols": ["A", "B"]}',
+ ["03a2f2c2e27f7a16", "d883f2a9b16ed085fcc5e4ca6c8f6ed1",
"9b51286144f87ce5aebdc61ca834379effa5a41ce6ac0938630ff246297caca8"],
+ ),
+ (
+ '{"type": "array", "items": "long"}',
+ ["715e2ea28bc91654", "c1c387e8d6a58f0df749b698991b1f43",
"f78e954167feb23dcb1ce01e8463cebf3408e0a4259e16f24bd38f6d0f1d578b"],
+ ),
+ (
+ json.dumps({"type": "array", "items": {"type": "enum", "name": "Test",
"symbols": ["A", "B"]}}),
+ ["10d9ade1fa3a0387", "cfc7b861c7cfef082a6ef082948893fa",
"0d8edd49d7f7e9553668f133577bc99f842852b55d9f84f1f7511e4961aa685c"],
+ ),
+ (
+ '{"type": "map", "values": "long"}',
+ ["6f74f4e409b1334e", "32b3f1a3177a0e73017920f00448b56e",
"b8fad07d458971a07692206b8a7cf626c86c62fe6bcff7c1b11bc7295de34853"],
+ ),
+ (
+ json.dumps({"type": "map", "values": {"type": "enum", "name": "Test",
"symbols": ["A", "B"]}}),
+ ["df2ab0626f6b812d", "c588da6ba99701c41e73fd30d23f994e",
"3886747ed1669a8af476b549e97b34222afb2fed5f18bb27c6f367ea0351a576"],
+ ),
+ (
+ '["string", "null", "long"]',
+ ["65a5be410d687566", "b11cf95f0a55dd55f9ee515a37bf937a",
"ed8d254116441bb35e237ad0563cf5432b8c975334bd222c1ee84609435d95bb"],
+ ),
+ (
+ json.dumps({"type": "record", "name": "Test", "fields": [{"name": "f",
"type": "long"}]}),
+ ["ed94e5f5e6eb588e", "69531a03db788afe353244cd049b1e6d",
"9670f15a8f96d23e92830d00b8bd57275e02e3e173ffef7c253c170b6beabeb8"],
+ ),
+ (
+ json.dumps(
+ {
+ "type": "record",
+ "name": "Node",
+ "fields": [{"name": "label", "type": "string"}, {"name":
"children", "type": {"type": "array", "items": "Node"}}],
+ }
+ ),
+ ["52cba544c3e756b7", "99625b0cc02050363e89ef66b0f406c9",
"65d80dc8c95c98a9671d92cf0415edfabfee2cb058df2138606656cd6ae4dc59"],
+ ),
+ (
+ json.dumps(
+ {
+ "type": "record",
+ "name": "Lisp",
+ "fields": [
+ {
+ "name": "value",
+ "type": [
+ "null",
+ "string",
+ {"type": "record", "name": "Cons", "fields":
[{"name": "car", "type": "Lisp"}, {"name": "cdr", "type": "Lisp"}]},
+ ],
+ }
+ ],
+ }
+ ),
+ ["68d91a23eda0b306", "9e1d0d15b52789fcb8e3a88b53059d5f",
"e5ce4f4a15ce19fa1047cfe16a3b0e13a755db40f00f23284fdd376fc1c7dd21"],
+ ),
+ (
+ json.dumps(
+ {
+ "type": "record",
+ "name": "HandshakeRequest",
+ "namespace": "org.apache.avro.ipc",
+ "fields": [
+ {"name": "clientHash", "type": {"type": "fixed", "name":
"MD5", "size": 16}},
+ {"name": "clientProtocol", "type": ["null", "string"]},
+ {"name": "serverHash", "type": "MD5"},
+ {"name": "meta", "type": ["null", {"type": "map",
"values": "bytes"}]},
+ ],
+ }
+ ),
+ ["43818703b7b5d769", "16ded8b5027e80a17704c6565c0c3f1b",
"6c317314687da52a85c813a7f0c92298a60b79625b9acc072e4d9e4256a1d800"],
+ ),
+ (
+ json.dumps(
+ {
+ "type": "record",
+ "name": "HandshakeResponse",
+ "namespace": "org.apache.avro.ipc",
+ "fields": [
+ {"name": "match", "type": {"type": "enum", "name":
"HandshakeMatch", "symbols": ["BOTH", "CLIENT", "NONE"]}},
+ {"name": "serverProtocol", "type": ["null", "string"]},
+ {"name": "serverHash", "type": ["null", {"name": "MD5",
"size": 16, "type": "fixed"}]},
+ {"name": "meta", "type": ["null", {"type": "map",
"values": "bytes"}]},
+ ],
+ }
+ ),
+ ["00feee01de4ea50e", "afe529d01132daab7f4e2a6663e7a2f5",
"a303cbbfe13958f880605d70c521a4b7be34d9265ac5a848f25916a67b11d889"],
+ ),
+ (
+ json.dumps(
+ {
+ "type": "record",
+ "name": "Interop",
+ "namespace": "org.apache.avro",
+ "fields": [
+ {"name": "intField", "type": "int"},
+ {"name": "longField", "type": "long"},
+ {"name": "stringField", "type": "string"},
+ {"name": "boolField", "type": "boolean"},
+ {"name": "floatField", "type": "float"},
+ {"name": "doubleField", "type": "double"},
+ {"name": "bytesField", "type": "bytes"},
+ {"name": "nullField", "type": "null"},
+ {"name": "arrayField", "type": {"type": "array", "items":
"double"}},
+ {
+ "name": "mapField",
+ "type": {"type": "map", "values": {"name": "Foo",
"type": "record", "fields": [{"name": "label", "type": "string"}]}},
+ },
+ {"name": "unionField", "type": ["boolean", "double",
{"type": "array", "items": "bytes"}]},
+ {"name": "enumField", "type": {"type": "enum", "name":
"Kind", "symbols": ["A", "B", "C"]}},
+ {"name": "fixedField", "type": {"type": "fixed", "name":
"MD5", "size": 16}},
+ {
+ "name": "recordField",
+ "type": {
+ "type": "record",
+ "name": "Node",
+ "fields": [{"name": "label", "type": "string"},
{"name": "children", "type": {"type": "array", "items": "Node"}}],
+ },
+ },
+ ],
+ }
+ ),
+ ["e82c0a93a6a0b5a4", "994fea1a1be7ff8603cbe40c3bc7e4ca",
"cccfd6e3f917cf53b0f90c206342e6703b0d905071f724a1c1f85b731c74058d"],
+ ),
+ (
+ json.dumps(
+ {
+ "type": "record",
+ "name": "ipAddr",
+ "fields": [{"name": "addr", "type": [{"name": "IPv6", "type":
"fixed", "size": 16}, {"name": "IPv4", "type": "fixed", "size": 4}]}],
+ }
+ ),
+ ["8d961b4e298a1844", "45d85c69b353a99b93d7c4f2fcf0c30d",
"6f6fc8f685a4f07d99734946565d63108806d55a8620febea047cf52cb0ac181"],
+ ),
+ (
+ json.dumps({"type": "record", "name": "TestDoc", "doc": "Doc string",
"fields": [{"name": "name", "type": "string", "doc": "Doc String"}]}),
+ ["0e6660f02bcdc109", "f2da75f5131f5ab80629538287b8beb2",
"0b3644f7aa5ca2fc4bad93ca2d3609c12aa9dbda9c15e68b34c120beff08e7b9"],
+ ),
+ (
+ '{"type": "enum", "name": "Test", "symbols": ["A", "B"], "doc": "Doc
String"}',
+ ["03a2f2c2e27f7a16", "d883f2a9b16ed085fcc5e4ca6c8f6ed1",
"9b51286144f87ce5aebdc61ca834379effa5a41ce6ac0938630ff246297caca8"],
+ ),
+]
+
EXAMPLES = PRIMITIVE_EXAMPLES
EXAMPLES += FIXED_EXAMPLES
EXAMPLES += ENUM_EXAMPLES
@@ -634,13 +832,25 @@ class TestMisc(unittest.TestCase):
def test_parse_invalid_symbol(self):
"""Disabling enumschema symbol validation should allow invalid symbols
to pass."""
test_schema_string = json.dumps({"type": "enum", "name": "AVRO2174",
"symbols": ["white space"]})
+
with self.assertRaises(avro.errors.InvalidName, msg="When enum symbol
validation is enabled, an invalid symbol should raise InvalidName."):
avro.schema.parse(test_schema_string, validate_enum_symbols=True)
+
try:
avro.schema.parse(test_schema_string, validate_enum_symbols=False)
except avro.errors.InvalidName: # pragma: no coverage
self.fail("When enum symbol validation is disabled, an invalid
symbol should not raise InvalidName.")
+ def test_unsupported_fingerprint_algorithm(self):
+ s = avro.schema.parse('"int"')
+ self.assertRaises(avro.errors.UnknownFingerprintAlgorithmException,
s.fingerprint, "foo")
+
+ def test_less_popular_fingerprint_algorithm(self):
+ s = avro.schema.parse('"int"')
+ fingerprint = s.fingerprint("sha384")
+ hex_fingerprint = "".join(format(b, "02x") for b in
fingerprint).zfill(16)
+ self.assertEqual(hex_fingerprint,
"32ed5e4ac896570f044d1dab68f4c8ca9866ac06d22261f399316bf4799e16854750238085775107dfac905c82b2feaf")
+
class SchemaParseTestCase(unittest.TestCase):
"""Enable generating parse test cases over all the valid and invalid
example schema."""
@@ -1181,6 +1391,39 @@ class CanonicalFormTestCase(unittest.TestCase):
)
+class FingerprintTestCase(unittest.TestCase):
+ """
+ Enable generating fingerprint test cases across algorithms.
+
+ Fingerprint examples are in the form of tuples:
+ - Value in Position 0 is schema
+ - Value in Position 1 is an array of fingerprints:
+ - Position 0 is CRC-64-AVRO fingerprint
+ - Position 1 is MD5 fingerprint
+ - Position 2 is SHA256 fingerprint
+ """
+
+ def __init__(self, test_schema, fingerprints):
+ """Ignore the normal signature for unittest.TestCase because we are
generating
+ many test cases from this one class. This is safe as long as the
autoloader
+ ignores this class. The autoloader will ignore this class as long as
it has
+ no methods starting with `test_`.
+ """
+ super(FingerprintTestCase, self).__init__("validate_fingerprint")
+ self.test_schema = test_schema
+ self.fingerprints = fingerprints
+
+ def _hex_fingerprint(self, fingerprint):
+ return "".join(format(b, "02x") for b in fingerprint).zfill(16)
+
+ def validate_fingerprint(self):
+ """The CRC-64-AVRO, MD5 and SHA256 fingerprints of a Schema should match the expected hex digests."""
+ s = avro.schema.parse(self.test_schema)
+ self.assertEqual(self._hex_fingerprint(s.fingerprint()),
self.fingerprints[0])
+ self.assertEqual(self._hex_fingerprint(s.fingerprint("md5")),
self.fingerprints[1])
+ self.assertEqual(self._hex_fingerprint(s.fingerprint("sha256")),
self.fingerprints[2])
+
+
def load_tests(loader, default_tests, pattern):
"""Generate test cases across many test schema."""
suite = unittest.TestSuite()
@@ -1190,6 +1433,7 @@ def load_tests(loader, default_tests, pattern):
suite.addTests(DocAttributesTestCase(ex) for ex in DOC_EXAMPLES)
suite.addTests(OtherAttributesTestCase(ex) for ex in OTHER_PROP_EXAMPLES)
suite.addTests(loader.loadTestsFromTestCase(CanonicalFormTestCase))
+ suite.addTests(FingerprintTestCase(ex[0], ex[1]) for ex in
FINGERPRINT_EXAMPLES)
return suite