ShravanSunder commented on issue #3293: URL: https://github.com/apache/arrow-adbc/issues/3293#issuecomment-3192579849
"""Workarounds for pgvector and JSONB columns with the ADBC PostgreSQL driver.

From a comment on apache/arrow-adbc#3293: the driver cannot ingest
``list[float]`` columns into pgvector ``vector`` columns, nor
``extension<arrow.json>`` columns into ``jsonb`` columns.  These helpers
re-encode such columns into the binary wire formats PostgreSQL expects,
carried as plain Arrow binary columns.
"""

import json
import struct
import typing as t

import pyarrow as pa


def encode_pgvector_column(vectors: t.List[t.List[float] | None]) -> pa.BinaryArray:
    """Encode float vectors to the pgvector binary wire format.

    The pgvector binary representation is: uint16 dimension, uint16
    reserved (always 0), then ``dimension`` big-endian float32 values.

    Args:
        vectors: List of float vectors; ``None`` entries become SQL NULL.

    Returns:
        Binary Arrow array with one pgvector-encoded value per input row.

    Raises:
        struct.error: If a vector has more than 65535 dimensions (the
            uint16 header cannot represent it).
    """

    def encode_single(vec: t.List[float] | None) -> bytes | None:
        if vec is None:
            return None
        dim = len(vec)
        # ">HH": big-endian dimension + reserved field (always 0).
        header = struct.pack(">HH", dim, 0)
        payload = struct.pack(f">{dim}f", *vec) if dim else b""
        return header + payload

    return pa.array([encode_single(v) for v in vectors], type=pa.binary())


def encode_jsonb_column(json_data: t.List[t.Any]) -> pa.BinaryArray:
    """Encode JSON values to the PostgreSQL JSONB binary wire format.

    The JSONB binary format is a single version byte ``0x01`` followed by
    the UTF-8 encoded JSON text.

    Args:
        json_data: JSON-serializable objects, pre-serialized JSON strings
            (e.g. from an ``extension<arrow.json>`` column), or ``None``
            for SQL NULL.

    Returns:
        Binary Arrow array with one JSONB-encoded value per input row.
    """
    encoded: t.List[bytes | None] = []
    for item in json_data:
        if item is None:
            encoded.append(None)
            continue
        if isinstance(item, str):
            # Already JSON text (extension<arrow.json> yields strings).
            json_str = item
        else:
            # Compact JSON: no extra whitespace, keep non-ASCII as-is.
            json_str = json.dumps(item, separators=(",", ":"), ensure_ascii=False)
        encoded.append(b"\x01" + json_str.encode("utf-8"))
    return pa.array(encoded, type=pa.binary())


def _auto_detect_embedding_columns(table: pa.Table) -> t.List[str]:
    """Detect columns that look like embeddings (lists of floats).

    Recognizes ``list``, ``large_list``, and ``fixed_size_list`` of
    float32/float64 — embeddings are frequently stored as
    ``fixed_size_list`` in Arrow, and ``to_pylist()`` converts all three
    the same way.

    Args:
        table: Arrow table to inspect.

    Returns:
        Names of columns holding float-list data.
    """
    embedding_cols: t.List[str] = []
    for field in table.schema:
        ftype = field.type
        is_listish = (
            pa.types.is_list(ftype)
            or pa.types.is_large_list(ftype)
            or pa.types.is_fixed_size_list(ftype)
        )
        if is_listish and (
            pa.types.is_float32(ftype.value_type)
            or pa.types.is_float64(ftype.value_type)
        ):
            embedding_cols.append(field.name)
    return embedding_cols


def _auto_detect_json_extension_columns(table: pa.Table) -> t.List[str]:
    """Detect columns carrying the ``arrow.json`` extension type.

    Args:
        table: Arrow table to inspect.

    Returns:
        Names of columns whose type is the JSON extension type.
    """
    json_cols: t.List[str] = []
    for field in table.schema:
        ftype = field.type
        # Prefer the ExtensionType API over string comparison; keep the
        # string form as a fallback for pyarrow builds where the JSON
        # type's repr is the only stable handle.
        is_json = (
            isinstance(ftype, pa.ExtensionType)
            and ftype.extension_name == "arrow.json"
        ) or str(ftype) == "extension<arrow.json>"
        if is_json:
            json_cols.append(field.name)
    return json_cols


def prepare_arrow_table_for_pg_adbc(table: pa.Table) -> pa.Table:
    """Prepare an Arrow table for PostgreSQL ADBC ingestion.

    Automatically detects and fixes driver compatibility issues:

    1. float-list columns -> pgvector binary (for ``vector`` columns);
    2. ``extension<arrow.json>`` columns -> 0x01-prefixed binary (for
       ``jsonb`` columns).

    Args:
        table: Arrow table to prepare for PostgreSQL.

    Returns:
        A new table (original is not mutated) compatible with the ADBC
        PostgreSQL driver; untouched columns are carried over as-is.
    """
    # pgvector: re-encode every detected embedding column in place.
    for col_name in _auto_detect_embedding_columns(table):
        col_idx = table.schema.get_field_index(col_name)
        vectors = table[col_name].to_pylist()
        table = table.set_column(col_idx, col_name, encode_pgvector_column(vectors))

    # JSONB: re-encode every detected JSON extension column in place.
    for col_name in _auto_detect_json_extension_columns(table):
        col_idx = table.schema.get_field_index(col_name)
        json_data = table[col_name].to_pylist()
        table = table.set_column(col_idx, col_name, encode_jsonb_column(json_data))

    return table