ShravanSunder commented on issue #3293:
URL: https://github.com/apache/arrow-adbc/issues/3293#issuecomment-3192579849
My workaround for JSONB and for pgvector (vector) columns is below
```
import json
import struct
import typing as t
import pyarrow as pa
def encode_pgvector_column(vectors: t.List[t.List[float] | None]) -> pa.BinaryArray:
    """Encode float vectors into pgvector's binary wire format for ADBC.

    Layout per value: big-endian uint16 dimension count, a uint16 reserved
    field (always 0), then the elements as big-endian float32.

    Args:
        vectors: List of float vectors; ``None`` entries stay ``None``
            (become SQL NULL).

    Returns:
        PyArrow binary array holding the encoded vectors.
    """

    def to_bytes(vec: t.List[float] | None) -> bytes | None:
        if vec is None:
            return None
        # One pack call emits header (dim, reserved=0) plus the float payload;
        # with an empty vector this degenerates to the 4-byte header alone.
        return struct.pack(f">HH{len(vec)}f", len(vec), 0, *vec)

    return pa.array([to_bytes(v) for v in vectors], type=pa.binary())
def encode_jsonb_column(json_data: t.List[t.Any]) -> pa.BinaryArray:
    """Encode JSON values into PostgreSQL's JSONB binary format for ADBC.

    JSONB binary wire format: a single version byte 0x01 followed by the
    UTF-8 encoded JSON text.

    Args:
        json_data: JSON-serializable objects. String items are assumed to
            already be JSON text (e.g. values coming from an
            extension<arrow.json> column) and are passed through as-is.

    Returns:
        PyArrow binary array with 0x01-prefixed JSONB payloads;
        ``None`` entries stay ``None``.
    """
    out: t.List[bytes | None] = []
    for value in json_data:
        if value is None:
            out.append(None)
            continue
        # Strings are taken as ready-made JSON; everything else is dumped
        # compactly (no extra whitespace), keeping non-ASCII characters.
        text = (
            value
            if isinstance(value, str)
            else json.dumps(value, separators=(",", ":"), ensure_ascii=False)
        )
        out.append(b"\x01" + text.encode("utf-8"))
    return pa.array(out, type=pa.binary())
def _auto_detect_embedding_columns(table: pa.Table) -> t.List[str]:
    """Detect columns that look like embeddings (list-like of floats).

    Generalized beyond plain ``list``: embedding columns commonly arrive as
    ``fixed_size_list<float32>`` (and occasionally ``large_list``), which the
    original ``pa.types.is_list`` check would miss; all three list layouts
    expose ``.value_type``.

    Args:
        table: Arrow table to inspect.

    Returns:
        Names of columns whose type is a list-like container of
        float32 or float64 values.
    """
    embedding_cols: t.List[str] = []
    for field in table.schema:
        dtype = field.type
        # Accept any list-like container; all three expose .value_type.
        if (
            pa.types.is_list(dtype)
            or pa.types.is_large_list(dtype)
            or pa.types.is_fixed_size_list(dtype)
        ):
            value_type = dtype.value_type
            if pa.types.is_float32(value_type) or pa.types.is_float64(value_type):
                embedding_cols.append(field.name)
    return embedding_cols
def _auto_detect_json_extension_columns(table: pa.Table) -> t.List[str]:
    """Detect columns carrying the extension<arrow.json> type.

    Args:
        table: Arrow table to inspect.

    Returns:
        Names of columns whose type renders as "extension<arrow.json>".
    """
    # NOTE(review): the JSON extension type is matched by its string
    # rendering; verify this stays accurate across pyarrow versions.
    return [
        field.name
        for field in table.schema
        if str(field.type) == "extension<arrow.json>"
    ]
def prepare_arrow_table_for_pg_adbc(
    table: pa.Table,
) -> pa.Table:
    """Prepare an Arrow table for PostgreSQL ADBC ingestion.

    Automatically detects and rewrites columns the PostgreSQL ADBC driver
    cannot ingest directly:

    1. list<float> columns -> pgvector binary payloads (VECTOR type)
    2. extension<arrow.json> columns -> 0x01-prefixed bytes (JSONB type)

    Args:
        table: Arrow table to prepare for PostgreSQL.

    Returns:
        A table with the affected columns replaced by binary-encoded ones.
    """
    # Detect both column groups up front, before any columns are replaced.
    vector_names = _auto_detect_embedding_columns(table)
    json_names = _auto_detect_json_extension_columns(table)

    # Rewrite embedding columns in pgvector binary form.
    for name in vector_names:
        if name not in table.column_names:
            continue
        encoded = encode_pgvector_column(table[name].to_pylist())
        position = table.schema.get_field_index(name)
        table = table.set_column(position, name, encoded)

    # Rewrite JSON extension columns as 0x01-prefixed JSONB bytes.
    for name in json_names:
        if name not in table.column_names:
            continue
        encoded = encode_jsonb_column(table[name].to_pylist())
        position = table.schema.get_field_index(name)
        table = table.set_column(position, name, encoded)

    return table
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]