This is an automated email from the ASF dual-hosted git repository.
chaokunyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/fory.git
The following commit(s) were added to refs/heads/main by this push:
new ef6f68ecd feat(python): type meta encoding for python (#2509)
ef6f68ecd is described below
commit ef6f68ecda23e6b47cd34b26eafcc33de05201cb
Author: Shawn Yang <[email protected]>
AuthorDate: Mon Aug 25 22:48:46 2025 +0800
feat(python): type meta encoding for python (#2509)
## Why?
Type forward/backward compatible serialization is critical for online
services, where different services update their data schemas and deploy at
different times, so the schema is in an inconsistent state.
Meta shared encoding can address this:
https://fory.apache.org/docs/specification/fory_xlang_serialization_spec#type-def
## What does this PR do?
Add type meta encoding for python to support type forward/backward
compatible serialization.
Things not finished in this PR:
- non-nullable field support
- generate serializer from type meta
- meta share mode
These features will be implemented in follow-up PRs.
## Related issues
#1938
#2160
#2278
## Does this PR introduce any user-facing change?
<!--
If any user-facing interface changes, please [open an
issue](https://github.com/apache/fory/issues/new/choose) describing the
need to do so and update the document if necessary.
Delete section if not applicable.
-->
- [ ] Does this PR introduce any public API change?
- [ ] Does this PR introduce any binary protocol compatibility change?
## Benchmark
<!--
When the PR has an impact on performance (if you don't know whether the
PR will have an impact on performance, you can submit the PR first, and
if it will have impact on performance, the code reviewer will explain
it), be sure to attach a benchmark data here.
Delete section if not applicable.
-->
---
python/pyfory/__init__.py | 3 +
python/pyfory/_registry.py | 14 ++
python/pyfory/_serialization.pyx | 9 +
python/pyfory/_struct.py | 49 +++++-
python/pyfory/_util.pxd | 2 +
python/pyfory/_util.pyx | 5 +
python/pyfory/meta/metastring.py | 17 +-
python/pyfory/meta/typedef.py | 240 +++++++++++++++++++++++++++
python/pyfory/meta/typedef_decoder.py | 185 +++++++++++++++++++++
python/pyfory/meta/typedef_encoder.py | 206 +++++++++++++++++++++++
python/pyfory/serializer.py | 2 +-
python/pyfory/tests/test_typedef_encoding.py | 147 ++++++++++++++++
python/pyfory/type.py | 17 +-
13 files changed, 887 insertions(+), 9 deletions(-)
diff --git a/python/pyfory/__init__.py b/python/pyfory/__init__.py
index 4a5e46396..7cfdf62d5 100644
--- a/python/pyfory/__init__.py
+++ b/python/pyfory/__init__.py
@@ -21,6 +21,9 @@ from pyfory._fory import ( # noqa: F401 # pylint:
disable=unused-import
Language,
)
+PYTHON = Language.PYTHON
+XLANG = Language.XLANG
+
try:
from pyfory._serialization import ENABLE_FORY_CYTHON_SERIALIZATION
except ImportError:
diff --git a/python/pyfory/_registry.py b/python/pyfory/_registry.py
index 1fe214366..05715a579 100644
--- a/python/pyfory/_registry.py
+++ b/python/pyfory/_registry.py
@@ -66,6 +66,7 @@ from pyfory.serializer import (
ObjectSerializer,
)
from pyfory.meta.metastring import MetaStringEncoder, MetaStringDecoder
+from pyfory.meta.meta_compressor import DeflaterMetaCompressor
from pyfory.type import (
TypeId,
Int8Type,
@@ -154,6 +155,7 @@ class TypeResolver:
"namespace_decoder",
"typename_encoder",
"typename_decoder",
+ "meta_compressor",
"require_registration",
"metastring_resolver",
"language",
@@ -182,6 +184,7 @@ class TypeResolver:
self.namespace_decoder = MetaStringDecoder(".", "_")
self.typename_encoder = MetaStringEncoder("$", "_")
self.typename_decoder = MetaStringDecoder("$", "_")
+ self.meta_compressor = DeflaterMetaCompressor()
def initialize(self):
self._initialize_xlang()
@@ -576,6 +579,17 @@ class TypeResolver:
else:
return self._type_id_to_typeinfo[type_id]
+ def get_typeinfo_by_id(self, type_id):
+ """Get typeinfo by type_id."""
+ return self._type_id_to_typeinfo[type_id]
+
+ def get_typeinfo_by_name(self, namespace, typename):
+ """Get typeinfo by namespace and typename."""
+ return self._named_type_to_typeinfo.get((namespace, typename))
+
+ def get_meta_compressor(self):
+ return self.meta_compressor
+
def reset(self):
pass
diff --git a/python/pyfory/_serialization.pyx b/python/pyfory/_serialization.pyx
index d98a248d5..315333a70 100644
--- a/python/pyfory/_serialization.pyx
+++ b/python/pyfory/_serialization.pyx
@@ -570,6 +570,15 @@ cdef class TypeResolver:
raise ValueError(f"Unexpected type_id {type_id}")
typeinfo = <TypeInfo> typeinfo_ptr
return typeinfo
+
+ def get_typeinfo_by_id(self, type_id):
+ return self._resolver.get_typeinfo_by_id(type_id=type_id)
+
+ def get_typeinfo_by_name(self, namespace, typename):
+ return self._resolver.get_typeinfo_by_name(namespace=namespace,
typename=typename)
+
+ def get_meta_compressor(self):
+ return self._resolver.get_meta_compressor()
cpdef inline reset(self):
pass
diff --git a/python/pyfory/_struct.py b/python/pyfory/_struct.py
index 6c455424e..4e251c1c8 100644
--- a/python/pyfory/_struct.py
+++ b/python/pyfory/_struct.py
@@ -18,6 +18,7 @@
import datetime
import enum
import logging
+import typing
from pyfory.type import (
TypeVisitor,
@@ -166,7 +167,7 @@ def _sort_fields(type_resolver, field_names, serializers):
map_types = sorted(map_types, key=sorter)
other_types = sorted(other_types, key=sorter)
all_types = boxed_types + final_types + other_types + collection_types +
map_types
- return [t[1] for t in all_types], [t[2] for t in all_types]
+ return [t[2] for t in all_types], [t[1] for t in all_types]
class StructHashVisitor(TypeVisitor):
@@ -221,3 +222,49 @@ class StructHashVisitor(TypeVisitor):
def get_hash(self):
return self._hash
+
+
+class StructTypeIdVisitor(TypeVisitor):
+ def __init__(
+ self,
+ fory,
+ ):
+ self.fory = fory
+
+ def visit_list(self, field_name, elem_type, types_path=None):
+ # Infer type recursively for type such as List[Dict[str, str]]
+ elem_ids = infer_field("item", elem_type, self, types_path=types_path)
+ return TypeId.LIST, elem_ids
+
+ def visit_dict(self, field_name, key_type, value_type, types_path=None):
+ # Infer type recursively for type such as Dict[str, Dict[str, str]]
+ key_ids = infer_field("key", key_type, self, types_path=types_path)
+ value_ids = infer_field("value", value_type, self,
types_path=types_path)
+ return TypeId.MAP, key_ids, value_ids
+
+ def visit_customized(self, field_name, type_, types_path=None):
+ return None, None
+
+ def visit_other(self, field_name, type_, types_path=None):
+ from pyfory.serializer import PickleSerializer # Local import
+
+ if is_subclass(type_, enum.Enum):
+ return self.fory.type_resolver.get_typeinfo(type_).type_id
+ if type_ not in basic_types and not is_py_array_type(type_):
+ return None, None
+ typeinfo = self.fory.type_resolver.get_typeinfo(type_)
+ assert not isinstance(typeinfo.serializer, (PickleSerializer,))
+ return [typeinfo.type_id]
+
+
+def get_field_names(clz, type_hints=None):
+ if hasattr(clz, "__dict__"):
+ # Regular object with __dict__
+ # We can't know the fields without an instance, so we rely on type
hints
+ if type_hints is None:
+ type_hints = typing.get_type_hints(clz)
+ return sorted(type_hints.keys())
+ elif hasattr(clz, "__slots__"):
+ # Object with __slots__
+ return sorted(clz.__slots__)
+ return []
diff --git a/python/pyfory/_util.pxd b/python/pyfory/_util.pxd
index 664af057c..f705800a8 100644
--- a/python/pyfory/_util.pxd
+++ b/python/pyfory/_util.pxd
@@ -94,6 +94,8 @@ cdef class Buffer:
cpdef inline write_bool(self, c_bool value)
+ cpdef inline write_uint8(self, uint8_t value)
+
cpdef inline write_int8(self, int8_t value)
cpdef inline write_int16(self, int16_t value)
diff --git a/python/pyfory/_util.pyx b/python/pyfory/_util.pyx
index 476fe30aa..5a7e68478 100644
--- a/python/pyfory/_util.pyx
+++ b/python/pyfory/_util.pyx
@@ -167,6 +167,11 @@ cdef class Buffer:
self.grow(<int32_t>1)
(<c_bool *>(self._c_address + self.writer_index))[0] = value
self.writer_index += <int32_t>1
+
+ cpdef inline write_uint8(self, uint8_t value):
+ self.grow(<int32_t>1)
+ (<uint8_t *>(self._c_address + self.writer_index))[0] = value
+ self.writer_index += <int32_t>1
cpdef inline write_int8(self, int8_t value):
self.grow(<int32_t>1)
diff --git a/python/pyfory/meta/metastring.py b/python/pyfory/meta/metastring.py
index 5fd306b74..f0dc4eca2 100644
--- a/python/pyfory/meta/metastring.py
+++ b/python/pyfory/meta/metastring.py
@@ -372,7 +372,7 @@ class MetaStringEncoder:
self.special_char2,
)
- def compute_encoding(self, input_string: str) -> Encoding:
+ def compute_encoding(self, input_string: str, encodings: List[Encoding] =
None) -> Encoding:
"""
Determines the encoding type of the input string.
@@ -384,23 +384,28 @@ class MetaStringEncoder:
"""
if not input_string:
return Encoding.LOWER_SPECIAL
+ if encodings is None:
+ encodings = list(Encoding.__members__.values())
chars = list(input_string)
statistics = self._compute_statistics(chars)
- if statistics.can_lower_special_encoded:
+ if statistics.can_lower_special_encoded and Encoding.LOWER_SPECIAL in
encodings:
return Encoding.LOWER_SPECIAL
- elif statistics.can_lower_upper_digit_special_encoded:
+ elif statistics.can_lower_upper_digit_special_encoded and
Encoding.LOWER_UPPER_DIGIT_SPECIAL in encodings:
if statistics.digit_count != 0:
return Encoding.LOWER_UPPER_DIGIT_SPECIAL
else:
upper_count = statistics.upper_count
if upper_count == 1 and chars[0].isupper():
return Encoding.FIRST_TO_LOWER_SPECIAL
- if (len(chars) + upper_count) * 5 < len(chars) * 6:
+ if (len(chars) + upper_count) * 5 < len(chars) * 6 and
Encoding.ALL_TO_LOWER_SPECIAL in encodings:
return Encoding.ALL_TO_LOWER_SPECIAL
else:
- return Encoding.LOWER_UPPER_DIGIT_SPECIAL
- return Encoding.UTF_8
+ if Encoding.LOWER_UPPER_DIGIT_SPECIAL in encodings:
+ return Encoding.LOWER_UPPER_DIGIT_SPECIAL
+ if Encoding.UTF_8 in encodings:
+ return Encoding.UTF_8
+ raise ValueError(f"No encoding found for string: {input_string},
encodings: {encodings}")
def _compute_statistics(self, chars: List[str]) -> Statistics:
"""
diff --git a/python/pyfory/meta/typedef.py b/python/pyfory/meta/typedef.py
new file mode 100644
index 000000000..dd786e990
--- /dev/null
+++ b/python/pyfory/meta/typedef.py
@@ -0,0 +1,240 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from typing import List
+import typing
+from pyfory.type import TypeId
+from pyfory._util import Buffer
+from pyfory.serializer import MapSerializer, ListSerializer, SetSerializer
+from pyfory._struct import _sort_fields, StructTypeIdVisitor, get_field_names
+from pyfory.type import TypeId, infer_field, is_primitive_type,
is_polymorphic_type
+from pyfory.meta.metastring import Encoding
+
+
+# Constants from the specification
+SMALL_NUM_FIELDS_THRESHOLD = 0b11111
+REGISTER_BY_NAME_FLAG = 0b100000
+FIELD_NAME_SIZE_THRESHOLD = 0b1111
+COMPRESS_META_FLAG = 0b1 << 13
+HAS_FIELDS_META_FLAG = 0b1 << 12
+META_SIZE_MASKS = 0xFFF
+NUM_HASH_BITS = 50
+
+# Field name encoding constants
+FIELD_NAME_ENCODING_UTF8 = 0b00
+FIELD_NAME_ENCODING_ALL_TO_LOWER_SPECIAL = 0b01
+FIELD_NAME_ENCODING_LOWER_UPPER_DIGIT_SPECIAL = 0b10
+FIELD_NAME_ENCODING_TAG_ID = 0b11
+FIELD_NAME_ENCODINGS = [Encoding.UTF_8, Encoding.LOWER_UPPER_DIGIT_SPECIAL,
Encoding.ALL_TO_LOWER_SPECIAL]
+
+
+class TypeDef:
+ def __init__(self, name: str, type_id: int, fields: List["FieldInfo"],
encoded: bytes = None, is_compressed: bool = False):
+ self.name = name
+ self.type_id = type_id
+ self.fields = fields
+ self.encoded = encoded
+ self.is_compressed = is_compressed
+
+ def create_fields_serializer(self, resolver):
+ serializers = [field_info.field_type.create_serializer(resolver) for
field_info in self.fields]
+ return serializers
+
+ def __repr__(self):
+ return f"TypeDef(name={self.name}, type_id={self.type_id},
fields={self.fields}, is_compressed={self.is_compressed})"
+
+
+class FieldInfo:
+ def __init__(self, name: str, field_type: "FieldType", defined_class: str):
+ self.name = name
+ self.field_type = field_type
+ self.defined_class = defined_class
+
+ def xwrite(self, buffer: Buffer):
+ self.field_type.xwrite(buffer, True)
+
+ @classmethod
+ def xread(cls, buffer: Buffer, resolver):
+ field_type = FieldType.xread(buffer, resolver)
+ # Note: name and defined_class would need to be read from the buffer
+ # This is a simplified version
+ return cls("", field_type, "")
+
+ def __repr__(self):
+ return f"FieldInfo(name={self.name}, field_type={self.field_type},
defined_class={self.defined_class})"
+
+
+class FieldType:
+ def __init__(self, type_id: int, is_monomorphic: bool, is_nullable: bool,
is_tracking_ref: bool):
+ self.type_id = type_id
+ self.is_monomorphic = is_monomorphic
+ self.is_nullable = is_nullable
+ self.is_tracking_ref = is_tracking_ref
+
+ def xwrite(self, buffer: Buffer, write_flags: bool = True):
+ xtype_id = self.type_id
+ if write_flags:
+ xtype_id = xtype_id << 2
+ if self.is_nullable:
+ xtype_id |= 0b10
+ if self.is_tracking_ref:
+ xtype_id |= 0b1
+ buffer.write_varuint32(xtype_id)
+ # Handle nested types
+ if self.type_id in [TypeId.LIST, TypeId.SET]:
+ self.element_type.xwrite(buffer, True)
+ elif self.type_id == TypeId.MAP:
+ self.key_type.xwrite(buffer, True)
+ self.value_type.xwrite(buffer, True)
+
+ @classmethod
+ def xread(cls, buffer: Buffer, resolver):
+ xtype_id = buffer.read_varuint32()
+ is_tracking_ref = (xtype_id & 0b1) != 0
+ is_nullable = (xtype_id & 0b10) != 0
+ xtype_id = xtype_id >> 2
+ return cls.xread_with_type(buffer, resolver, xtype_id, is_nullable,
is_tracking_ref)
+
+ @classmethod
+ def xread_with_type(cls, buffer: Buffer, resolver, xtype_id: int,
is_nullable: bool, is_tracking_ref: bool):
+ if xtype_id in [TypeId.LIST, TypeId.SET]:
+ element_type = cls.xread(buffer, resolver)
+ return CollectionFieldType(xtype_id, True, is_nullable,
is_tracking_ref, element_type)
+ elif xtype_id == TypeId.MAP:
+ key_type = cls.xread(buffer, resolver)
+ value_type = cls.xread(buffer, resolver)
+ return MapFieldType(xtype_id, True, is_nullable, is_tracking_ref,
key_type, value_type)
+ elif xtype_id == TypeId.UNKNOWN:
+ return DynamicFieldType(xtype_id, False, is_nullable,
is_tracking_ref)
+ else:
+ return FieldType(xtype_id, False, is_nullable, is_tracking_ref)
+
+ def create_serializer(self, resolver):
+ if self.type_id in [TypeId.EXT, TypeId.STRUCT, TypeId.NAMED_STRUCT,
TypeId.COMPATIBLE_STRUCT, TypeId.NAMED_COMPATIBLE_STRUCT, TypeId.UNKNOWN]:
+ return None
+ return resolver.get_typeinfo_by_id(self.type_id).serializer
+
+ def __repr__(self):
+ return f"FieldType(type_id={self.type_id},
is_monomorphic={self.is_monomorphic}, is_nullable={self.is_nullable},
is_tracking_ref={self.is_tracking_ref})"
+
+
+class CollectionFieldType(FieldType):
+ def __init__(
+ self,
+ type_id: int,
+ is_monomorphic: bool,
+ is_nullable: bool,
+ is_tracking_ref: bool,
+ element_type: FieldType,
+ ):
+ super().__init__(type_id, is_monomorphic, is_nullable, is_tracking_ref)
+ self.element_type = element_type
+
+ def create_serializer(self, resolver):
+ if self.type_id == TypeId.LIST:
+ return ListSerializer(resolver.fory, list,
self.element_type.create_serializer(resolver))
+ elif self.type_id == TypeId.SET:
+ return SetSerializer(resolver.fory, set,
self.element_type.create_serializer(resolver))
+ else:
+ raise ValueError(f"Unknown collection type: {self.type_id}")
+
+
+class MapFieldType(FieldType):
+ def __init__(
+ self,
+ type_id: int,
+ is_monomorphic: bool,
+ is_nullable: bool,
+ is_tracking_ref: bool,
+ key_type: FieldType,
+ value_type: FieldType,
+ ):
+ super().__init__(type_id, is_monomorphic, is_nullable, is_tracking_ref)
+ self.key_type = key_type
+ self.value_type = value_type
+
+ def create_serializer(self, resolver):
+ key_serializer = self.key_type.create_serializer(resolver)
+ value_serializer = self.value_type.create_serializer(resolver)
+ return MapSerializer(resolver.fory, dict, key_serializer,
value_serializer)
+
+ def __repr__(self):
+ return (
+ f"MapFieldType(type_id={self.type_id},
is_monomorphic={self.is_monomorphic}, is_nullable={self.is_nullable}, "
+ f"is_tracking_ref={self.is_tracking_ref},
key_type={self.key_type}, value_type={self.value_type})"
+ )
+
+
+class DynamicFieldType(FieldType):
+ def __init__(self, type_id: int, is_monomorphic: bool, is_nullable: bool,
is_tracking_ref: bool):
+ super().__init__(type_id, is_monomorphic, is_nullable, is_tracking_ref)
+
+ def create_serializer(self, resolver):
+ return None
+
+ def __repr__(self):
+ return f"DynamicFieldType(type_id={self.type_id},
is_monomorphic={self.is_monomorphic}, is_nullable={self.is_nullable},
is_tracking_ref={self.is_tracking_ref})"
+
+
+def build_field_infos(type_resolver, cls):
+ """Build field information for the class."""
+ field_names = get_field_names(cls)
+ type_hints = typing.get_type_hints(cls)
+
+ field_infos = []
+ visitor = StructTypeIdVisitor(type_resolver.fory)
+
+ for field_name in field_names:
+ field_type_hint = type_hints.get(field_name, typing.Any)
+ field_type = build_field_type(type_resolver, field_name,
field_type_hint, visitor)
+ field_info = FieldInfo(field_name, field_type, cls.__name__)
+ field_infos.append(field_info)
+
+ serializers = [field_info.field_type.create_serializer(type_resolver) for
field_info in field_infos]
+ field_names, serializers = _sort_fields(type_resolver, field_names,
serializers)
+ field_infos_map = {field_info.name: field_info for field_info in
field_infos}
+ new_field_infos = []
+ for field_name in field_names:
+ field_info = field_infos_map[field_name]
+ new_field_infos.append(field_info)
+ return new_field_infos
+
+
+def build_field_type(type_resolver, field_name: str, type_hint, visitor):
+ """Build field type from type hint."""
+ type_ids = infer_field(field_name, type_hint, visitor)
+ return build_field_type_from_type_ids(type_resolver, field_name, type_ids,
visitor)
+
+
+def build_field_type_from_type_ids(type_resolver, field_name: str, type_ids,
visitor):
+ tracking_ref = type_resolver.fory.ref_tracking
+ type_id = type_ids[0]
+ morphic = not is_polymorphic_type(type_id)
+ if type_id in [TypeId.SET, TypeId.LIST]:
+ elem_type = build_field_type_from_type_ids(type_resolver, field_name,
type_ids[1], visitor)
+ return CollectionFieldType(type_id, morphic, True, tracking_ref,
elem_type)
+ elif type_id == TypeId.MAP:
+ key_type = build_field_type_from_type_ids(type_resolver, field_name,
type_ids[1], visitor)
+ value_type = build_field_type_from_type_ids(type_resolver, field_name,
type_ids[2], visitor)
+ return MapFieldType(type_id, morphic, True, tracking_ref, key_type,
value_type)
+ elif type_id in [TypeId.UNKNOWN, TypeId.EXT, TypeId.STRUCT,
TypeId.NAMED_STRUCT, TypeId.COMPATIBLE_STRUCT, TypeId.NAMED_COMPATIBLE_STRUCT]:
+ return DynamicFieldType(type_id, False, True, tracking_ref)
+ else:
+ assert is_primitive_type(type_id) or type_id in [TypeId.STRING,
TypeId.ENUM, TypeId.NAMED_ENUM], (
+ f"Unknown type: {type_id} for field: {field_name}"
+ )
+ return FieldType(type_id, morphic, True, tracking_ref)
diff --git a/python/pyfory/meta/typedef_decoder.py
b/python/pyfory/meta/typedef_decoder.py
new file mode 100644
index 000000000..18f6cdbb4
--- /dev/null
+++ b/python/pyfory/meta/typedef_decoder.py
@@ -0,0 +1,185 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+TypeDef decoder for xlang serialization.
+
+This module implements the decoding of TypeDef objects according to the xlang
serialization specification.
+"""
+
+from typing import List
+from pyfory._util import Buffer
+from pyfory.meta.typedef import TypeDef, FieldInfo, FieldType
+from pyfory.meta.typedef import (
+ FieldInfo,
+ TypeDef,
+ SMALL_NUM_FIELDS_THRESHOLD,
+ REGISTER_BY_NAME_FLAG,
+ FIELD_NAME_SIZE_THRESHOLD,
+ COMPRESS_META_FLAG,
+ HAS_FIELDS_META_FLAG,
+ META_SIZE_MASKS,
+ NUM_HASH_BITS,
+ FIELD_NAME_ENCODINGS,
+)
+from pyfory.type import TypeId
+from pyfory.meta.metastring import MetaStringDecoder, Encoding
+
+
+# Meta string decoders
+NAMESPACE_DECODER = MetaStringDecoder(".", "_")
+TYPENAME_DECODER = MetaStringDecoder("$", "_")
+FIELD_NAME_DECODER = MetaStringDecoder("$", "_")
+
+
+def decode_typedef(buffer: Buffer, resolver) -> TypeDef:
+ """
+ Decode a TypeDef from the buffer.
+
+ Args:
+ buffer: The buffer containing the encoded TypeDef.
+ resolver: The type resolver.
+
+ Returns:
+ The decoded TypeDef.
+ """
+ # Read global binary header
+ header = buffer.read_int64()
+
+ # Extract components from header
+ meta_size = header & META_SIZE_MASKS
+ has_fields_meta = (header & HAS_FIELDS_META_FLAG) != 0
+ is_compressed = (header & COMPRESS_META_FLAG) != 0
+
+ # If meta size is at maximum, read additional size
+ if meta_size == META_SIZE_MASKS:
+ meta_size += buffer.read_varuint32()
+
+ # Read meta data
+ meta_data = buffer.read_bytes(meta_size)
+
+ # Decompress if needed
+ if is_compressed:
+ meta_data = resolver.get_meta_compressor().decompress(meta_data)
+
+ # Create a new buffer for meta data
+ meta_buffer = Buffer(meta_data)
+
+ # Read meta header
+ meta_header = meta_buffer.read_uint8()
+
+ # Extract number of fields
+ num_fields = meta_header & 0b11111
+ if num_fields == SMALL_NUM_FIELDS_THRESHOLD:
+ num_fields += meta_buffer.read_varuint32()
+
+ # Check if registered by name
+ is_registered_by_name = (meta_header & REGISTER_BY_NAME_FLAG) != 0
+
+ # Read type info
+ if is_registered_by_name:
+ namespace = read_namespace(meta_buffer)
+ typename = read_typename(meta_buffer)
+ name = namespace + "." + typename if namespace else typename
+ # Look up the type_id from namespace and typename
+ type_info = resolver.get_typeinfo_by_name(namespace, typename)
+ if type_info:
+ type_id = type_info.type_id
+ else:
+ # Fallback to COMPATIBLE_STRUCT if not found
+ type_id = TypeId.COMPATIBLE_STRUCT
+ else:
+ type_id = meta_buffer.read_varuint32()
+ type_info = resolver.get_typeinfo_by_id(type_id)
+ name = type_info.cls.__name__
+
+ # Read fields info if present
+ field_infos = []
+ if has_fields_meta:
+ field_infos = read_fields_info(meta_buffer, resolver, name, num_fields)
+
+ # Create TypeDef object
+ return TypeDef(name, type_id, field_infos, meta_data, is_compressed)
+
+
+def read_namespace(buffer: Buffer) -> str:
+ """Read namespace from the buffer."""
+ return read_meta_string(buffer, NAMESPACE_DECODER)
+
+
+def read_typename(buffer: Buffer) -> str:
+ """Read typename from the buffer."""
+ return read_meta_string(buffer, TYPENAME_DECODER)
+
+
+def read_meta_string(buffer: Buffer, decoder: MetaStringDecoder) -> str:
+ """Read a meta string from the buffer."""
+ # Read encoding and length combined in first byte
+ header = buffer.read_uint8()
+
+ # Extract encoding (2 bits) and size (6 bits)
+ encoding_value = header & 0b11
+ size_value = (header >> 2) & 0b111111
+
+ encoding = Encoding(encoding_value)
+
+ # Read length - same logic as encoder
+ length = 0
+ if size_value >= FIELD_NAME_SIZE_THRESHOLD:
+ length = size_value - FIELD_NAME_SIZE_THRESHOLD +
buffer.read_varuint32()
+ else:
+ length = size_value
+
+ # Read encoded data
+ if length > 0:
+ encoded_data = buffer.read_bytes(length)
+ return decoder.decode(encoded_data, encoding)
+ else:
+ return ""
+
+
+def read_fields_info(buffer: Buffer, resolver, defined_class: str, num_fields:
int) -> List[FieldInfo]:
+ """Read field information from the buffer."""
+ field_infos = []
+ for _ in range(num_fields):
+ field_info = read_field_info(buffer, resolver, defined_class)
+ field_infos.append(field_info)
+ return field_infos
+
+
+def read_field_info(buffer: Buffer, resolver, defined_class: str) -> FieldInfo:
+ """Read a single field info from the buffer."""
+ # Read field header
+ header = buffer.read_uint8()
+
+ # Extract field header components
+ field_name_encoding = (header >> 6) & 0b11
+ field_name_size = (header >> 2) & 0b1111
+ if field_name_size == FIELD_NAME_SIZE_THRESHOLD:
+ field_name_size += buffer.read_varuint32()
+ field_name_size += 1
+ encoding = FIELD_NAME_ENCODINGS[field_name_encoding]
+ is_nullable = (header & 0b10) != 0
+ is_tracking_ref = header & 0b1
+
+ # Read field type info (without flags since they're in the header)
+ xtype_id = buffer.read_varuint32()
+ field_type = FieldType.xread_with_type(buffer, resolver, xtype_id,
is_nullable, is_tracking_ref)
+
+ # Read field name
+ field_name = FIELD_NAME_DECODER.decode(buffer.read_bytes(field_name_size),
encoding)
+ return FieldInfo(field_name, field_type, defined_class)
diff --git a/python/pyfory/meta/typedef_encoder.py
b/python/pyfory/meta/typedef_encoder.py
new file mode 100644
index 000000000..f652fc40f
--- /dev/null
+++ b/python/pyfory/meta/typedef_encoder.py
@@ -0,0 +1,206 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from collections import Counter
+
+from pyfory.meta.typedef import (
+ FieldInfo,
+ TypeDef,
+ build_field_infos,
+ SMALL_NUM_FIELDS_THRESHOLD,
+ REGISTER_BY_NAME_FLAG,
+ FIELD_NAME_SIZE_THRESHOLD,
+ COMPRESS_META_FLAG,
+ HAS_FIELDS_META_FLAG,
+ META_SIZE_MASKS,
+ NUM_HASH_BITS,
+ FIELD_NAME_ENCODINGS,
+)
+from pyfory.meta.metastring import MetaStringEncoder
+
+from pyfory._util import Buffer
+from pyfory.type import TypeId
+from pyfory.lib.mmh3 import hash_buffer
+
+
+# Meta string encoders
+NAMESPACE_ENCODER = MetaStringEncoder(".", "_")
+TYPENAME_ENCODER = MetaStringEncoder("$", "_")
+FIELD_NAME_ENCODER = MetaStringEncoder("$", "_")
+
+
+def encode_typedef(type_resolver, cls):
+ """
+ Encode the typedef of the type for xlang serialization.
+
+ Args:
+ type_resolver: The type resolver.
+ cls: The class to encode.
+
+ Returns:
+ The encoded TypeDef.
+ """
+ field_infos = build_field_infos(type_resolver, cls)
+
+ # Check for duplicate field names
+ field_names = [field_info.name for field_info in field_infos]
+ duplicate_field_names = [name for name, count in
Counter(field_names).items() if count > 1]
+ if duplicate_field_names:
+ # TODO: handle duplicate field names for inheritance in future
+ raise ValueError(f"Duplicate field names: {duplicate_field_names}")
+
+ buffer = Buffer.allocate(64)
+
+ # Write placeholder for header
+ buffer.write_uint8(0)
+
+ # Write meta header
+ header = len(field_infos)
+ if len(field_infos) >= SMALL_NUM_FIELDS_THRESHOLD:
+ header = SMALL_NUM_FIELDS_THRESHOLD
+ buffer.write_varuint32(len(field_infos) - SMALL_NUM_FIELDS_THRESHOLD)
+
+ # Write type info
+ type_info = type_resolver.get_typeinfo(cls)
+ assert type_info.type_id > 0
+
+ if not TypeId.is_namespaced_type(type_info.type_id):
+ buffer.write_varuint32(type_info.type_id)
+ else:
+ header |= REGISTER_BY_NAME_FLAG
+ namespace = type_info.decode_namespace()
+ typename = type_info.decode_typename()
+ write_namespace(buffer, namespace)
+ write_typename(buffer, typename)
+
+ # Update header byte
+ buffer.put_uint8(0, header)
+
+ # Write fields info
+ write_fields_info(type_resolver, buffer, field_infos)
+
+ # Get the encoded binary
+ binary = buffer.to_bytes()
+
+ # Compress if beneficial
+ compressed_binary = type_resolver.get_meta_compressor().compress(binary)
+ is_compressed = len(compressed_binary) < len(binary)
+ if is_compressed:
+ binary = compressed_binary
+ # Prepend header
+ binary = prepend_header(binary, is_compressed, len(field_infos) > 0)
+ return TypeDef(cls.__name__, type_info.type_id, field_infos, binary,
is_compressed)
+
+
+def prepend_header(buffer: bytes, is_compressed: bool, has_fields_meta: bool):
+ """Prepend header to the buffer."""
+ meta_size = len(buffer)
+ hash = hash_buffer(buffer, 47)[0]
+ hash <<= 64 - NUM_HASH_BITS
+ header = abs(hash) & 0x7FFFFFFFFFFFFFFF # Ensure it fits in 63 bits
+ if is_compressed:
+ header |= COMPRESS_META_FLAG
+
+ if has_fields_meta:
+ header |= HAS_FIELDS_META_FLAG
+
+ header |= min(meta_size, META_SIZE_MASKS)
+ result = Buffer.allocate(meta_size + 8)
+ result.write_int64(header)
+ if meta_size > META_SIZE_MASKS:
+ result.write_varuint32(meta_size - META_SIZE_MASKS)
+
+ result.write_bytes(buffer)
+ return result
+
+
+def write_namespace(buffer: Buffer, namespace: str):
+ """Write namespace using meta string encoding."""
+ # - Package name encoding(omitted when class is registered):
+ # - encoding algorithm:
`UTF8/ALL_TO_LOWER_SPECIAL/LOWER_UPPER_DIGIT_SPECIAL`
+ # - Header: `6 bits size | 2 bits encoding flags`.
+ # The `6 bits size: 0~63` will be used to indicate size `0~62`,
+ # the value `63` the size need more byte to read, the encoding will
encode `size - 62` as a varint next.
+ meta_string = NAMESPACE_ENCODER.encode(namespace)
+ write_meta_string(buffer, meta_string)
+
+
+def write_typename(buffer: Buffer, typename: str):
+ """Write typename using meta string encoding."""
+ # - Class name encoding(omitted when class is registered):
+ # - encoding algorithm:
+ #
`UTF8/LOWER_UPPER_DIGIT_SPECIAL/FIRST_TO_LOWER_SPECIAL/ALL_TO_LOWER_SPECIAL`
+ # - header: `6 bits size | 2 bits encoding flags`.
+ # The `6 bits size: 0~63` will be used to indicate size `1~64`,
+ # the value `63` the size need more byte to read, the encoding will
encode `size - 63` as a varint next.
+ meta_string = TYPENAME_ENCODER.encode(typename)
+ write_meta_string(buffer, meta_string)
+
+
+def write_meta_string(buffer: Buffer, meta_string):
+ """Write a meta string to the buffer."""
+ # Write encoding and length combined in first byte
+ length = len(meta_string.encoded_data)
+ encoding_value = meta_string.encoding.value
+
+ if length >= FIELD_NAME_SIZE_THRESHOLD:
+ # Use threshold value and write additional length
+ header = (FIELD_NAME_SIZE_THRESHOLD << 2) | encoding_value
+ buffer.write_uint8(header)
+ buffer.write_varuint32(length - FIELD_NAME_SIZE_THRESHOLD)
+ else:
+ # Combine length and encoding in single byte
+ header = (length << 2) | encoding_value
+ buffer.write_uint8(header)
+
+ # Write encoded data
+ if meta_string.encoded_data:
+ buffer.write_bytes(meta_string.encoded_data)
+
+
+def write_fields_info(type_resolver, buffer: Buffer, field_infos: list):
+ """Write field information to the buffer."""
+ for field_info in field_infos:
+ write_field_info(buffer, field_info)
+
+
+def write_field_info(buffer: Buffer, field_info: FieldInfo):
+ """Write a single field info to the buffer."""
+ # header: 2 bits field name encoding + 4 bits size + nullability flag +
ref tracking flag
+ header = 0
+ if field_info.field_type.is_nullable:
+ header |= 0b10
+ if field_info.field_type.is_tracking_ref:
+ header |= 0b1
+ encoding = FIELD_NAME_ENCODER.compute_encoding(field_info.name,
FIELD_NAME_ENCODINGS)
+ meta_string = FIELD_NAME_ENCODER.encode_with_encoding(field_info.name,
encoding)
+ field_name_binary_size = len(meta_string.encoded_data) - 1
+ encoding_flags = FIELD_NAME_ENCODINGS.index(meta_string.encoding)
+ header |= encoding_flags << 6
+ if field_name_binary_size >= FIELD_NAME_SIZE_THRESHOLD:
+ header |= 0b00111100
+ buffer.write_uint8(header)
+ buffer.write_varuint32(field_name_binary_size -
FIELD_NAME_SIZE_THRESHOLD)
+ else:
+ header |= field_name_binary_size << 2
+ buffer.write_uint8(header)
+
+ # Write field type info
+ field_info.field_type.xwrite(buffer, False)
+
+ # TODO: support tag id
+ buffer.write_bytes(meta_string.encoded_data)
diff --git a/python/pyfory/serializer.py b/python/pyfory/serializer.py
index aba3809c4..0018232d9 100644
--- a/python/pyfory/serializer.py
+++ b/python/pyfory/serializer.py
@@ -311,7 +311,7 @@ class DataClassSerializer(Serializer):
for index, key in enumerate(self._field_names):
serializer = infer_field(key, self._type_hints[key], visitor,
types_path=[])
self._serializers[index] = serializer
- self._serializers, self._field_names =
_sort_fields(fory.type_resolver, self._field_names, self._serializers)
+ self._field_names, self._serializers =
_sort_fields(fory.type_resolver, self._field_names, self._serializers)
self._hash = 0 # Will be computed on first xwrite/xread
self._generated_xwrite_method = self._gen_xwrite_method()
self._generated_xread_method = self._gen_xread_method()
diff --git a/python/pyfory/tests/test_typedef_encoding.py
b/python/pyfory/tests/test_typedef_encoding.py
new file mode 100644
index 000000000..70ad35182
--- /dev/null
+++ b/python/pyfory/tests/test_typedef_encoding.py
@@ -0,0 +1,147 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Tests for xlang TypeDef implementation.
+"""
+
+from dataclasses import dataclass
+from typing import List, Dict
+import pyfory
+from pyfory._util import Buffer
+from pyfory.meta.typedef import TypeDef, FieldInfo, FieldType,
CollectionFieldType, MapFieldType, DynamicFieldType
+from pyfory.meta.typedef_encoder import encode_typedef
+from pyfory.meta.typedef_decoder import decode_typedef
+from pyfory.type import TypeId
+from pyfory import Fory
+
+
+@dataclass
+class TestTypeDef:
+    """Dataclass fixture with scalar, list and map fields for TypeDef tests."""
+
+    # The class name matches pytest's default ``Test*`` collection pattern and
+    # the dataclass has a generated ``__init__``, so pytest would warn that it
+    # cannot collect it. Opt out explicitly. The attribute is unannotated, so
+    # it is a plain class attribute and not a dataclass field.
+    __test__ = False
+
+    name: str
+    age: int
+    scores: List[float]
+    metadata: Dict[str, str]
+
+
+@dataclass
+class SimpleTypeDef:
+    """Minimal dataclass fixture with a single ``int`` field."""
+
+    # The only field of this fixture.
+    value: int
+
+
+def test_collection_field_type():
+    """A CollectionFieldType exposes its type id, element type and nullability."""
+    int32_element = FieldType(TypeId.INT32, True, True, False)
+    collection = CollectionFieldType(TypeId.LIST, True, True, False, int32_element)
+
+    assert collection.type_id == TypeId.LIST
+    assert collection.element_type == int32_element
+    assert collection.is_nullable
+
+
+def test_map_field_type():
+    """A MapFieldType exposes its type id and its key and value field types."""
+    string_key = FieldType(TypeId.STRING, True, True, False)
+    int32_value = FieldType(TypeId.INT32, True, True, False)
+    mapping = MapFieldType(TypeId.MAP, True, True, False, string_key, int32_value)
+
+    assert mapping.type_id == TypeId.MAP
+    assert mapping.key_type == string_key
+    assert mapping.value_type == int32_value
+
+
+def test_typedef_creation():
+    """A TypeDef keeps the name, type id, fields, bytes and flag it was built with."""
+    field_specs = (("name", TypeId.STRING), ("age", TypeId.INT32))
+    fields = [
+        FieldInfo(field_name, FieldType(type_id, True, True, False), "TestTypeDef")
+        for field_name, type_id in field_specs
+    ]
+
+    typedef = TypeDef("TestTypeDef", TypeId.STRUCT, fields, b"encoded_data", False)
+
+    assert typedef.name == "TestTypeDef"
+    assert typedef.type_id == TypeId.STRUCT
+    assert len(typedef.fields) == 2
+    assert typedef.encoded == b"encoded_data"
+    assert typedef.is_compressed is False
+
+
+def test_field_info_creation():
+    """A FieldInfo keeps the name, field type and defining class it was built with."""
+    string_type = FieldType(TypeId.STRING, True, True, False)
+    info = FieldInfo("test_field", string_type, "TestClass")
+
+    assert info.name == "test_field"
+    assert info.field_type == string_type
+    assert info.defined_class == "TestClass"
+
+
+def test_dynamic_field_type():
+    """A DynamicFieldType reports the type id and flags passed to its constructor."""
+    dynamic = DynamicFieldType(TypeId.EXT, False, True, False)
+
+    assert dynamic.type_id == TypeId.EXT
+    assert dynamic.is_monomorphic is False
+    assert dynamic.is_nullable
+    assert dynamic.is_tracking_ref is False
+
+
+def test_encode_decode_typedef():
+    """Round-trip registered dataclasses through encode_typedef and decode_typedef."""
+    fory = Fory(language=pyfory.XLANG)
+    registered = (SimpleTypeDef, TestTypeDef)
+    for cls in registered:
+        # Each class is registered under its own class name.
+        fory.register(cls, namespace="example", typename=cls.__name__)
+    # Use the resolver owned by the Fory instance (a real one, not a mock).
+    resolver = fory.type_resolver
+
+    for type_ in registered:
+        # Encode, then decode from a fresh buffer over the encoded bytes.
+        typedef = encode_typedef(resolver, type_)
+        print(f"typedef: {typedef}")
+
+        decoded_typedef = decode_typedef(Buffer(typedef.encoded), resolver)
+        print(f"decoded_typedef: {decoded_typedef}")
+
+        # Top-level properties must survive the round trip.
+        assert decoded_typedef.type_id == typedef.type_id
+        assert decoded_typedef.is_compressed == typedef.is_compressed
+        assert len(decoded_typedef.fields) == len(typedef.fields)
+
+        # Lengths were just checked equal, so pairing the fields is safe.
+        for decoded_field, expected_field in zip(decoded_typedef.fields, typedef.fields):
+            assert decoded_field.name == expected_field.name
+            assert decoded_field.field_type.type_id == expected_field.field_type.type_id
+            assert decoded_field.field_type.is_nullable == expected_field.field_type.is_nullable
+
+
+if __name__ == "__main__":
+    # Run every test in this module in definition order when executed directly.
+    for run_test in (
+        test_collection_field_type,
+        test_map_field_type,
+        test_typedef_creation,
+        test_field_info_creation,
+        test_dynamic_field_type,
+        test_encode_decode_typedef,
+    ):
+        run_test()
+    print("All basic tests passed!")
diff --git a/python/pyfory/type.py b/python/pyfory/type.py
index add96c783..7018f504f 100644
--- a/python/pyfory/type.py
+++ b/python/pyfory/type.py
@@ -129,7 +129,7 @@ class TypeId:
Fory type for cross-language serialization.
See `org.apache.fory.types.Type`
"""
-
+ UNKNOWN = -1
# null value
NA = 0
# a boolean value (true or false).
@@ -356,6 +356,21 @@ def is_map_type(type_):
return issubclass(type_, typing.Dict)
except TypeError:
return False
+
+
+# Type ids for which ``is_polymorphic_type`` returns True: the struct and ext
+# variants plus ``TypeId.UNKNOWN``.
+_polymorphic_type_ids = {
+    TypeId.STRUCT,
+    TypeId.COMPATIBLE_STRUCT,
+    TypeId.NAMED_STRUCT,
+    TypeId.NAMED_COMPATIBLE_STRUCT,
+    TypeId.EXT,
+    TypeId.NAMED_EXT,
+    TypeId.UNKNOWN,
+}
+
+
+def is_polymorphic_type(type_id: int) -> bool:
+    """Return True when ``type_id`` is a member of ``_polymorphic_type_ids``."""
+    return type_id in _polymorphic_type_ids
def is_subclass(from_type, to_type):
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]