This is an automated email from the ASF dual-hosted git repository.
JingsongLi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new ab395c1b31 [python] Add 'paimon table explain' CLI command (#7896)
ab395c1b31 is described below
commit ab395c1b3184a1dcde8fedc0e7d6c2d6cd189434
Author: chaoyang <[email protected]>
AuthorDate: Wed May 20 19:11:42 2026 +0800
[python] Add 'paimon table explain' CLI command (#7896)
Add `paimon table explain <db.tbl>` CLI subcommand exposing
`ReadBuilder.explain()` (#7869). Supports `--where`, `--select`,
`--limit`, `--verbose`, `--format table|json`, mirroring
`paimon table read`. Follow-up to #7869; also removes the CLI TODO left
next to `ReadBuilder.explain`.
---
docs/content/pypaimon/cli.md | 61 +++++++++++
docs/content/pypaimon/python-api.md | 21 ++++
paimon-python/pypaimon/cli/cli_table.py | 139 ++++++++++++++++++++++++-
paimon-python/pypaimon/read/read_builder.py | 3 -
paimon-python/pypaimon/tests/cli_table_test.py | 137 ++++++++++++++++++++++++
5 files changed, 357 insertions(+), 4 deletions(-)
diff --git a/docs/content/pypaimon/cli.md b/docs/content/pypaimon/cli.md
index 6485ca5876..d5ee2b7e2a 100644
--- a/docs/content/pypaimon/cli.md
+++ b/docs/content/pypaimon/cli.md
@@ -156,6 +156,67 @@ Output:
5 Eve 32 Hangzhou
```
+### Table Explain
+
+Show the scan plan of a query without reading any data: the target snapshot, the pushed-down predicate / projection / limit, the partition / bucket / file-stats pruning funnel, and split-level signals (raw-convertible ratio, deletion-vector ratio, level histogram, files-per-split and split-size distribution). Useful for previewing the pruning effect of a predicate before actually running the read.
+
+```shell
+paimon table explain mydb.events
+```
+
+**Options:**
+
+- `--select, -s`: Project specific columns (comma-separated)
+- `--where, -w`: Filter condition in SQL-like syntax (same operators as `table read`)
+- `--limit, -l`: Row limit to push down
+- `--verbose, -v`: List every split with its files
+- `--format, -f`: Output format: `table` (default) or `json`
+
+**Examples:**
+
+```shell
+# Whole-table scan plan
+paimon table explain mydb.events
+
+# Push filter and projection through the planner
+paimon table explain mydb.events --where "dt = '2026-05-16' AND id = 7" -s dt,id,val
+
+# List every split (and its files) instead of just the aggregates
+paimon table explain mydb.events -w "dt = '2026-05-16'" --verbose
+
+# Machine-readable output for scripting (level_histogram keys are JSON strings)
+paimon table explain mydb.events --format json
+```
+
+Output:
+```
+== PyPaimon Scan Plan ==
+Table: mydb.events (PK, HASH_FIXED)
+Snapshot: 5 (schema 0)
+Predicate: (dt = '2026-05-16') AND (id = 7)
+Projection: [dt, id, val]
+Limit: <none>
+
+Partition pruning: 20 -> 4 (pruned 16)
+Bucket pruning: 4 -> 1 (pruned 3)
+File skipping: 1 -> 1 (pruned 0)
+
+Splits: 1
+ raw-convertible: 1 / 1
+ with DV: 0 / 1
+ all-above-L0: 0 / 1
+ files/split: min=1 max=1 avg=1.00
+ size/split: min=2.6 KiB p50=2.6 KiB p95=2.6 KiB max=2.6 KiB
+
+Files: 1
+Total size: 2.6 KiB
+Estimated rows: 10 (merged: 10)
+Level histogram: L0=1
+Deletion files: 0
+```
+
+`explain` reads the manifest list and manifest files but never opens any data files, so it is dramatically cheaper than a real read on large tables.
+
### Table Get
Get and display table schema information in JSON format. The output format is
the same as the schema JSON format used
diff --git a/docs/content/pypaimon/python-api.md
b/docs/content/pypaimon/python-api.md
index e83e1fa506..2c18cf060a 100644
--- a/docs/content/pypaimon/python-api.md
+++ b/docs/content/pypaimon/python-api.md
@@ -660,6 +660,27 @@ What the fields tell you:
`ExplainResult` is a plain dataclass — alongside the human-readable `__str__`
shown above, every field (`partition_pruning`, `bucket_pruning`,
`file_skipping`, `split_count`, `splits_raw_convertible`, `level_histogram`,
`splits`, ...) is addressable in Python for programmatic use.
+#### CLI
+
+The same scan plan is available from the `paimon` command line — useful for previewing pruning effects of a predicate without writing any Python:
+
+```bash
+# Whole-table scan
+paimon -c paimon.yaml table explain default.events
+
+# Push down filter / projection / limit and list every split
+paimon -c paimon.yaml table explain default.events \
+ --where "dt = '2026-05-16' AND id = 7" \
+ --select dt,id,val \
+ --limit 100 \
+ --verbose
+
+# Machine-readable output (level_histogram keys are JSON strings)
+paimon -c paimon.yaml table explain default.events --format json
+```
+
+`--where` accepts the same SQL-like syntax as `paimon table read`. With `--format json`, the result is a structured dump of `ExplainResult` suitable for piping into `jq` or further processing.
+
## Rollback
Paimon supports rolling back a table to a previous snapshot or tag. This is
useful for undoing unwanted changes or
diff --git a/paimon-python/pypaimon/cli/cli_table.py
b/paimon-python/pypaimon/cli/cli_table.py
index e428bd2d71..ba8446fcf9 100644
--- a/paimon-python/pypaimon/cli/cli_table.py
+++ b/paimon-python/pypaimon/cli/cli_table.py
@@ -22,6 +22,8 @@ This module provides table-related commands for the CLI.
"""
import sys
+from dataclasses import asdict
+
from pypaimon.common.json_util import JSON
@@ -147,6 +149,98 @@ def cmd_table_read(args):
print(df.to_string(index=False))
+def cmd_table_explain(args):
+    """
+    Execute the 'table explain' command.
+
+    Builds a read builder for ``args.table``, applies the optional
+    ``--select`` projection, ``--where`` filter and ``--limit``, then prints
+    the plan returned by ``ReadBuilder.explain()`` as text or, with
+    ``--format json``, as a JSON document. Every failure is reported on
+    stderr and terminates the process with exit status 1.
+    """
+    from pypaimon.cli.cli import load_catalog_config, create_catalog
+
+    def fail(message):
+        # Shared error exit: message to stderr, status 1.
+        print(message, file=sys.stderr)
+        sys.exit(1)
+
+    catalog = create_catalog(load_catalog_config(args.config))
+
+    table_identifier = args.table
+    # Exactly one dot must separate the database name from the table name.
+    if table_identifier.count('.') != 1:
+        fail(f"Error: Invalid table identifier '{table_identifier}'. "
+             f"Expected format: 'database.table'")
+    database_name, table_name = table_identifier.split('.')
+
+    try:
+        table = catalog.get_table(f"{database_name}.{table_name}")
+    except Exception as e:
+        fail(f"Error: Failed to get table '{table_identifier}': {e}")
+
+    read_builder = table.new_read_builder()
+    known_names = {field.name for field in table.table_schema.fields}
+
+    projection_arg = getattr(args, 'select', None)
+    if projection_arg:
+        requested = [name.strip() for name in projection_arg.split(',')]
+        unknown = [name for name in requested if name not in known_names]
+        if unknown:
+            fail(f"Error: Column(s) {unknown} do not exist in table '{table_identifier}'.")
+        read_builder = read_builder.with_projection(requested)
+
+    where_clause = getattr(args, 'where', None)
+    if where_clause:
+        from pypaimon.cli.where_parser import parse_where_clause
+        try:
+            predicate = parse_where_clause(where_clause, table.table_schema.fields)
+            if predicate:
+                read_builder = read_builder.with_filter(predicate)
+        except ValueError as e:
+            fail(f"Error: Invalid WHERE clause: {e}")
+
+    # Explain exists to show what the planner sees, so the limit is always
+    # handed to the builder here (this differs from `table read`).
+    limit = getattr(args, 'limit', None)
+    if limit is not None:
+        read_builder = read_builder.with_limit(limit)
+
+    verbose = getattr(args, 'verbose', False)
+    try:
+        result = read_builder.explain(verbose=verbose)
+    except Exception as e:
+        fail(f"Error: Failed to explain table '{table_identifier}': {e}")
+
+    if getattr(args, 'format', 'table') == 'json':
+        import json
+        payload = _explain_result_to_json_dict(result)
+        print(json.dumps(payload, indent=2, ensure_ascii=False))
+    else:
+        print(str(result))
+
+
+def _explain_result_to_json_dict(result):
+    """Serialize an ``ExplainResult`` to a JSON-friendly dict.
+
+    ``level_histogram`` has ``int`` keys, both at the top level and
+    inside each split. ``json.dumps`` would coerce them to strings
+    silently; we do it up front so the output is explicit and stable.
+
+    Args:
+        result: A dataclass instance; it is converted with
+            ``dataclasses.asdict``, so nested dataclasses become dicts.
+
+    Returns:
+        A dict whose top-level ``level_histogram``, and the
+        ``level_histogram`` of every entry in ``splits`` (when ``splits``
+        is not ``None``), has string keys. A missing or ``None`` histogram
+        is emitted as an empty dict.
+    """
+    def stringify_keys(histogram):
+        # ``asdict`` always emits the key, so a ``dict.get`` default never
+        # covers a ``None`` value; fall back explicitly instead.
+        return {str(level): count for level, count in (histogram or {}).items()}
+
+    payload = asdict(result)
+    payload['level_histogram'] = stringify_keys(payload.get('level_histogram'))
+    if payload.get('splits') is not None:
+        for split in payload['splits']:
+            split['level_histogram'] = stringify_keys(split.get('level_histogram'))
+    return payload
+
+
def cmd_table_full_text_search(args):
"""
Execute the 'table full-text-search' command.
@@ -827,7 +921,50 @@ def add_table_subcommands(table_parser):
help='Output format: table (default) or json'
)
read_parser.set_defaults(func=cmd_table_read)
-
+
+ # table explain command
+ explain_parser = table_subparsers.add_parser(
+ 'explain',
+ help='Show the scan plan (snapshot, pushdown, pruning funnel, split
shape) '
+ 'without reading data'
+ )
+ explain_parser.add_argument(
+ 'table',
+ help='Table identifier in format: database.table'
+ )
+ explain_parser.add_argument(
+ '--select', '-s',
+ type=str,
+ default=None,
+ help='Project specific columns (comma-separated, e.g., "id,name,age")'
+ )
+ explain_parser.add_argument(
+ '--where', '-w',
+ type=str,
+ default=None,
+ help='Filter condition in SQL-like syntax '
+ '(e.g., "age > 18", "dt = \'2026-01-01\' AND id IN (1,2,3)")'
+ )
+ explain_parser.add_argument(
+ '--limit', '-l',
+ type=int,
+ default=None,
+ help='Row limit to push down'
+ )
+ explain_parser.add_argument(
+ '--verbose', '-v',
+ action='store_true',
+ help='List every split with its files'
+ )
+ explain_parser.add_argument(
+ '--format', '-f',
+ type=str,
+ choices=['table', 'json'],
+ default='table',
+ help='Output format: table (default) or json'
+ )
+ explain_parser.set_defaults(func=cmd_table_explain)
+
# table get command
get_parser = table_subparsers.add_parser('get', help='Get table schema
information')
get_parser.add_argument(
diff --git a/paimon-python/pypaimon/read/read_builder.py
b/paimon-python/pypaimon/read/read_builder.py
index 51233856f2..2ee9a39040 100644
--- a/paimon-python/pypaimon/read/read_builder.py
+++ b/paimon-python/pypaimon/read/read_builder.py
@@ -101,9 +101,6 @@ class ReadBuilder:
def new_predicate_builder(self) -> PredicateBuilder:
return PredicateBuilder(self.read_type())
- # TODO: surface this through pypaimon's CLI (alongside cli_sql /
- # cli_table) so users can run `pypaimon explain ...` against a table
- # without writing any Python.
def explain(self, verbose: bool = False) -> ExplainResult:
"""Produce a structured scan plan for this builder.
diff --git a/paimon-python/pypaimon/tests/cli_table_test.py
b/paimon-python/pypaimon/tests/cli_table_test.py
index b0e314644b..b88eae8b6b 100644
--- a/paimon-python/pypaimon/tests/cli_table_test.py
+++ b/paimon-python/pypaimon/tests/cli_table_test.py
@@ -1439,6 +1439,143 @@ class CliTableTest(unittest.TestCase):
self.assertEqual(len(result.elements), 1)
self.assertEqual(result.elements[0].spec['dt'], '2024-01-02')
+    def test_cli_table_explain_basic(self):
+        """Plain `table explain` renders the plan headers and no row data."""
+        argv = ['paimon', '-c', self.config_file,
+                'table', 'explain', 'test_db.users']
+        with patch('sys.argv', argv), \
+                patch('sys.stdout', new_callable=StringIO) as mock_stdout:
+            try:
+                main()
+            except SystemExit:
+                pass
+
+        output = mock_stdout.getvalue()
+
+        # Section headers emitted by the plan renderer.
+        for anchor in ('== PyPaimon Scan Plan ==', 'Table:', 'Snapshot:',
+                       'Splits:', 'Files:'):
+            self.assertIn(anchor, output)
+        # Row values must never leak into a plan-only command.
+        for row_value in ('Alice', 'Bob'):
+            self.assertNotIn(row_value, output)
+
+    def test_cli_table_explain_with_select_and_limit(self):
+        """`--select` and `--limit` are reflected in the Projection / Limit lines."""
+        with patch('sys.argv',
+                   ['paimon', '-c', self.config_file,
+                    'table', 'explain', 'test_db.users',
+                    '--select', 'id,name',
+                    '--limit', '3']):
+            with patch('sys.stdout', new_callable=StringIO) as mock_stdout:
+                try:
+                    main()
+                except SystemExit:
+                    pass
+
+                output = mock_stdout.getvalue()
+
+        # Anchor each value to its own line: a bare assertIn('3', output)
+        # would also pass on any unrelated digit (sizes, snapshot ids, ...).
+        self.assertRegex(output, r'Projection:\s+\[id, name\]')
+        self.assertRegex(output, r'Limit:\s+3\b')
+
+    def test_cli_table_explain_verbose_lists_splits(self):
+        """With `--verbose` the output gains a Splits[] section, one entry per split."""
+        argv = ['paimon', '-c', self.config_file,
+                'table', 'explain', 'test_db.users',
+                '--verbose']
+        with patch('sys.argv', argv), \
+                patch('sys.stdout', new_callable=StringIO) as mock_stdout:
+            try:
+                main()
+            except SystemExit:
+                pass
+
+        output = mock_stdout.getvalue()
+        # Section header, then the first split entry, which is prefixed
+        # with "[0] partition=".
+        for marker in ('Splits[]', '[0] partition='):
+            self.assertIn(marker, output)
+
+    def test_cli_table_explain_where_partition_pruning(self):
+        """Filtering on partition columns shows up as pruned partitions."""
+        self._create_partitioned_table()
+
+        argv = ['paimon', '-c', self.config_file,
+                'table', 'explain', 'test_db.partitioned',
+                '--where', "dt = '2024-01-01' AND region = 'us'"]
+        with patch('sys.argv', argv), \
+                patch('sys.stdout', new_callable=StringIO) as mock_stdout:
+            try:
+                main()
+            except SystemExit:
+                pass
+
+        output = mock_stdout.getvalue()
+
+        # The predicate line is present and mentions the filtered column.
+        for expected in ('Predicate:', 'dt', 'Partition pruning:'):
+            self.assertIn(expected, output)
+        # Funnel reads "<before> -> <after> (pruned N)" with N >= 1.
+        funnel = r'Partition pruning:\s+\d+ -> \d+\s+\(pruned [1-9]\d*\)'
+        self.assertRegex(output, funnel)
+
+    def test_cli_table_explain_format_json(self):
+        """`--format json` emits valid JSON whose level_histogram keys are strings."""
+        import json
+        argv = ['paimon', '-c', self.config_file,
+                'table', 'explain', 'test_db.users',
+                '--format', 'json']
+        with patch('sys.argv', argv), \
+                patch('sys.stdout', new_callable=StringIO) as mock_stdout:
+            try:
+                main()
+            except SystemExit:
+                pass
+
+        payload = json.loads(mock_stdout.getvalue())
+
+        # Identity plus the snapshot / split aggregate fields.
+        self.assertEqual(payload['table_identifier'], 'test_db.users')
+        for field_name in ('snapshot_id', 'split_count',
+                           'level_histogram', 'partition_pruning'):
+            self.assertIn(field_name, payload)
+
+        # Every histogram key must be a string; the loop is a no-op when
+        # the histogram is empty.
+        for key in payload['level_histogram'].keys():
+            self.assertIsInstance(key, str)
+
+        # Without --verbose the per-split list is serialized as null.
+        self.assertIsNone(payload['splits'])
+
+    def test_cli_table_explain_invalid_table(self):
+        """A table that does not exist exits with status 1 and an stderr message."""
+        argv = ['paimon', '-c', self.config_file,
+                'table', 'explain', 'test_db.does_not_exist']
+        with patch('sys.argv', argv), \
+                patch('sys.stderr', new_callable=StringIO) as mock_stderr:
+            with self.assertRaises(SystemExit) as ctx:
+                main()
+
+        self.assertEqual(ctx.exception.code, 1)
+        self.assertIn("Failed to get table", mock_stderr.getvalue())
+
+    def test_cli_table_explain_invalid_where(self):
+        """An unparsable WHERE clause exits with status 1 and an stderr message."""
+        argv = ['paimon', '-c', self.config_file,
+                'table', 'explain', 'test_db.users',
+                '--where', 'this is not a valid clause']
+        with patch('sys.argv', argv), \
+                patch('sys.stderr', new_callable=StringIO) as mock_stderr:
+            with self.assertRaises(SystemExit) as ctx:
+                main()
+
+        self.assertEqual(ctx.exception.code, 1)
+        self.assertIn("Invalid WHERE clause", mock_stderr.getvalue())
+
if __name__ == '__main__':
unittest.main()