ShravanSunder commented on issue #3293:
URL: https://github.com/apache/arrow-adbc/issues/3293#issuecomment-3192579849
My workaround for JSONB and for pgvector (vector) columns is below
```
import json
import struct
import typing as t
import pyarrow as pa
def encode_pgvector_column(vectors: t.List[t.List[float] | None]) -> pa.BinaryArray:
    """Encode float vectors into pgvector's binary wire format for ADBC.

    Layout per value: big-endian uint16 dimension count, a uint16 reserved
    field (always 0), then the elements as big-endian float32.

    Args:
        vectors: List of float vectors; ``None`` entries stay ``None``
            (become SQL NULL).

    Returns:
        PyArrow binary array holding the encoded vectors.
    """

    def to_bytes(vec: t.List[float] | None) -> bytes | None:
        if vec is None:
            return None
        # One pack call emits header (dim, reserved=0) plus the float payload;
        # with an empty vector this degenerates to the 4-byte header alone.
        return struct.pack(f">HH{len(vec)}f", len(vec), 0, *vec)

    return pa.array([to_bytes(v) for v in vectors], type=pa.binary())
def encode_jsonb_column(json_data: t.List[t.Any]) -> pa.BinaryArray:
    """Encode JSON values into PostgreSQL's JSONB binary format for ADBC.

    JSONB binary wire format: a single version byte 0x01 followed by the
    UTF-8 encoded JSON text.

    Args:
        json_data: JSON-serializable objects. String items are assumed to
            already be JSON text (e.g. values coming from an
            extension<arrow.json> column) and are passed through as-is.

    Returns:
        PyArrow binary array with 0x01-prefixed JSONB payloads;
        ``None`` entries stay ``None``.
    """
    out: t.List[bytes | None] = []
    for value in json_data:
        if value is None:
            out.append(None)
            continue
        # Strings are taken as ready-made JSON; everything else is dumped
        # compactly (no extra whitespace), keeping non-ASCII characters.
        text = (
            value
            if isinstance(value, str)
            else json.dumps(value, separators=(",", ":"), ensure_ascii=False)
        )
        out.append(b"\x01" + text.encode("utf-8"))
    return pa.array(out, type=pa.binary())
def _auto_detect_embedding_columns(table: pa.Table) -> t.List[str]:
    """Detect columns that look like embeddings (list-like of floats).

    Generalized beyond plain ``list``: embedding columns commonly arrive as
    ``fixed_size_list<float32>`` (and occasionally ``large_list``), which the
    original ``pa.types.is_list`` check would miss; all three list layouts
    expose ``.value_type``.

    Args:
        table: Arrow table to inspect.

    Returns:
        Names of columns whose type is a list-like container of
        float32 or float64 values.
    """
    embedding_cols: t.List[str] = []
    for field in table.schema:
        dtype = field.type
        # Accept any list-like container; all three expose .value_type.
        if (
            pa.types.is_list(dtype)
            or pa.types.is_large_list(dtype)
            or pa.types.is_fixed_size_list(dtype)
        ):
            value_type = dtype.value_type
            if pa.types.is_float32(value_type) or pa.types.is_float64(value_type):
                embedding_cols.append(field.name)
    return embedding_cols
def _auto_detect_json_extension_columns(table: pa.Table) -> t.List[str]:
    """Detect columns carrying the extension<arrow.json> type.

    Args:
        table: Arrow table to inspect.

    Returns:
        Names of columns whose type renders as "extension<arrow.json>".
    """
    # NOTE(review): the JSON extension type is matched by its string
    # rendering; verify this stays accurate across pyarrow versions.
    return [
        field.name
        for field in table.schema
        if str(field.type) == "extension<arrow.json>"
    ]
def prepare_arrow_table_for_pg_adbc(
    table: pa.Table,
) -> pa.Table:
    """Prepare an Arrow table for PostgreSQL ADBC ingestion.

    Automatically detects and rewrites columns the PostgreSQL ADBC driver
    cannot ingest directly:

    1. list<float> columns -> pgvector binary payloads (VECTOR type)
    2. extension<arrow.json> columns -> 0x01-prefixed bytes (JSONB type)

    Args:
        table: Arrow table to prepare for PostgreSQL.

    Returns:
        A table with the affected columns replaced by binary-encoded ones.
    """
    # Detect both column groups up front, before any columns are replaced.
    vector_names = _auto_detect_embedding_columns(table)
    json_names = _auto_detect_json_extension_columns(table)

    # Rewrite embedding columns in pgvector binary form.
    for name in vector_names:
        if name not in table.column_names:
            continue
        encoded = encode_pgvector_column(table[name].to_pylist())
        position = table.schema.get_field_index(name)
        table = table.set_column(position, name, encoded)

    # Rewrite JSON extension columns as 0x01-prefixed JSONB bytes.
    for name in json_names:
        if name not in table.column_names:
            continue
        encoded = encode_jsonb_column(table[name].to_pylist())
        position = table.schema.get_field_index(name)
        table = table.set_column(position, name, encoded)

    return table
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]