This is an automated email from the ASF dual-hosted git repository. timsaucer pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/datafusion-python.git
The following commit(s) were added to refs/heads/main by this push: new 03921d96 feat: dataframe string formatter (#1170) 03921d96 is described below commit 03921d9629c576afc43b63ad67172f70abcd73e8 Author: Tim Saucer <timsau...@gmail.com> AuthorDate: Wed Jun 25 08:20:49 2025 -0400 feat: dataframe string formatter (#1170) * Add string formatter * Rename html_formatter to dataframe_formatter * Add deprecation warning * Small adjustments based on user feedback --- python/datafusion/__init__.py | 2 +- python/datafusion/dataframe.py | 15 +- .../{html_formatter.py => dataframe_formatter.py} | 30 +- python/datafusion/html_formatter.py | 700 +-------------------- python/tests/test_dataframe.py | 4 +- src/dataframe.rs | 109 ++-- 6 files changed, 119 insertions(+), 741 deletions(-) diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 69062fd3..fd7f4fc0 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -46,11 +46,11 @@ from .context import ( SQLOptions, ) from .dataframe import DataFrame, ParquetColumnOptions, ParquetWriterOptions +from .dataframe_formatter import configure_formatter from .expr import ( Expr, WindowFrame, ) -from .html_formatter import configure_formatter from .io import read_avro, read_csv, read_json, read_parquet from .plan import ExecutionPlan, LogicalPlan from .record_batch import RecordBatch, RecordBatchStream diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 49c0b2f8..991e6875 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -52,7 +52,6 @@ if TYPE_CHECKING: import polars as pl import pyarrow as pa - from datafusion._internal import DataFrame as DataFrameInternal from datafusion._internal import expr as expr_internal from enum import Enum @@ -327,6 +326,20 @@ class DataFrame: def _repr_html_(self) -> str: return self.df._repr_html_() + @staticmethod + def default_str_repr( + batches: list[pa.RecordBatch], + schema: pa.Schema, + has_more: bool, + table_uuid: str | None = None, + ) -> str: + """Return the default string representation of a DataFrame. + + This method is used by the default formatter and implemented in Rust for + performance reasons. + """ + return DataFrameInternal.default_str_repr(batches, schema, has_more, table_uuid) + def describe(self) -> DataFrame: """Return the statistics for this DataFrame. diff --git a/python/datafusion/html_formatter.py b/python/datafusion/dataframe_formatter.py similarity index 96% copy from python/datafusion/html_formatter.py copy to python/datafusion/dataframe_formatter.py index 12a7e455..27f00f9c 100644 --- a/python/datafusion/html_formatter.py +++ b/python/datafusion/dataframe_formatter.py @@ -26,6 +26,8 @@ from typing import ( runtime_checkable, ) +from datafusion._internal import DataFrame as DataFrameInternal + def _validate_positive_int(value: Any, param_name: str) -> None: """Validate that a parameter is a positive integer. @@ -269,7 +271,7 @@ class DataFrameHtmlFormatter: True if styles have been loaded, False otherwise Example: - >>> from datafusion.html_formatter import DataFrameHtmlFormatter + >>> from datafusion.dataframe_formatter import DataFrameHtmlFormatter >>> DataFrameHtmlFormatter.is_styles_loaded() False """ @@ -345,6 +347,32 @@ class DataFrameHtmlFormatter: return "\n".join(html) + def format_str( + self, + batches: list, + schema: Any, + has_more: bool = False, + table_uuid: str | None = None, + ) -> str: + """Format record batches as a string. + + This method is used by DataFrame's __repr__ implementation and can be + called directly when string rendering is needed. + + Args: + batches: List of Arrow RecordBatch objects + schema: Arrow Schema object + has_more: Whether there are more batches not shown + table_uuid: Unique ID for the table, used for JavaScript interactions + + Returns: + String representation of the data + + Raises: + TypeError: If schema is invalid and no batches are provided + """ + return DataFrameInternal.default_str_repr(batches, schema, has_more, table_uuid) + def _build_html_header(self) -> list[str]: """Build the HTML header with CSS styles.""" html = [] diff --git a/python/datafusion/html_formatter.py b/python/datafusion/html_formatter.py index 12a7e455..65eb1f04 100644 --- a/python/datafusion/html_formatter.py +++ b/python/datafusion/html_formatter.py @@ -14,698 +14,16 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""HTML formatting utilities for DataFusion DataFrames.""" -from __future__ import annotations +"""Deprecated module for dataframe formatting.""" -from typing import ( - Any, - Callable, - Optional, - Protocol, - runtime_checkable, -) - - -def _validate_positive_int(value: Any, param_name: str) -> None: - """Validate that a parameter is a positive integer. - - Args: - value: The value to validate - param_name: Name of the parameter (used in error message) - - Raises: - ValueError: If the value is not a positive integer - """ - if not isinstance(value, int) or value <= 0: - msg = f"{param_name} must be a positive integer" - raise ValueError(msg) - - -def _validate_bool(value: Any, param_name: str) -> None: - """Validate that a parameter is a boolean. - - Args: - value: The value to validate - param_name: Name of the parameter (used in error message) - - Raises: - TypeError: If the value is not a boolean - """ - if not isinstance(value, bool): - msg = f"{param_name} must be a boolean" - raise TypeError(msg) - - -@runtime_checkable -class CellFormatter(Protocol): - """Protocol for cell value formatters.""" - - def __call__(self, value: Any) -> str: - """Format a cell value to string representation.""" - ... - - -@runtime_checkable -class StyleProvider(Protocol): - """Protocol for HTML style providers.""" - - def get_cell_style(self) -> str: - """Get the CSS style for table cells.""" - ... - - def get_header_style(self) -> str: - """Get the CSS style for header cells.""" - ... - - -class DefaultStyleProvider: - """Default implementation of StyleProvider.""" - - def get_cell_style(self) -> str: - """Get the CSS style for table cells. - - Returns: - CSS style string - """ - return ( - "border: 1px solid black; padding: 8px; text-align: left; " - "white-space: nowrap;" - ) - - def get_header_style(self) -> str: - """Get the CSS style for header cells. - - Returns: - CSS style string - """ - return ( - "border: 1px solid black; padding: 8px; text-align: left; " - "background-color: #f2f2f2; white-space: nowrap; min-width: fit-content; " - "max-width: fit-content;" - ) - - -class DataFrameHtmlFormatter: - """Configurable HTML formatter for DataFusion DataFrames. - - This class handles the HTML rendering of DataFrames for display in - Jupyter notebooks and other rich display contexts. - - This class supports extension through composition. Key extension points: - - Provide a custom StyleProvider for styling cells and headers - - Register custom formatters for specific types - - Provide custom cell builders for specialized cell rendering - - Args: - max_cell_length: Maximum characters to display in a cell before truncation - max_width: Maximum width of the HTML table in pixels - max_height: Maximum height of the HTML table in pixels - max_memory_bytes: Maximum memory in bytes for rendered data (default: 2MB) - min_rows_display: Minimum number of rows to display - repr_rows: Default number of rows to display in repr output - enable_cell_expansion: Whether to add expand/collapse buttons for long cell - values - custom_css: Additional CSS to include in the HTML output - show_truncation_message: Whether to display a message when data is truncated - style_provider: Custom provider for cell and header styles - use_shared_styles: Whether to load styles and scripts only once per notebook - session - """ - - # Class variable to track if styles have been loaded in the notebook - _styles_loaded = False - - def __init__( - self, - max_cell_length: int = 25, - max_width: int = 1000, - max_height: int = 300, - max_memory_bytes: int = 2 * 1024 * 1024, # 2 MB - min_rows_display: int = 20, - repr_rows: int = 10, - enable_cell_expansion: bool = True, - custom_css: Optional[str] = None, - show_truncation_message: bool = True, - style_provider: Optional[StyleProvider] = None, - use_shared_styles: bool = True, - ) -> None: - """Initialize the HTML formatter. - - Parameters - ---------- - max_cell_length : int, default 25 - Maximum length of cell content before truncation. - max_width : int, default 1000 - Maximum width of the displayed table in pixels. - max_height : int, default 300 - Maximum height of the displayed table in pixels. - max_memory_bytes : int, default 2097152 (2MB) - Maximum memory in bytes for rendered data. - min_rows_display : int, default 20 - Minimum number of rows to display. - repr_rows : int, default 10 - Default number of rows to display in repr output. - enable_cell_expansion : bool, default True - Whether to allow cells to expand when clicked. - custom_css : str, optional - Custom CSS to apply to the HTML table. - show_truncation_message : bool, default True - Whether to show a message indicating that content has been truncated. - style_provider : StyleProvider, optional - Provider of CSS styles for the HTML table. If None, DefaultStyleProvider - is used. - use_shared_styles : bool, default True - Whether to use shared styles across multiple tables. - - Raises: - ------ - ValueError - If max_cell_length, max_width, max_height, max_memory_bytes, - min_rows_display, or repr_rows is not a positive integer. - TypeError - If enable_cell_expansion, show_truncation_message, or use_shared_styles is - not a boolean, - or if custom_css is provided but is not a string, - or if style_provider is provided but does not implement the StyleProvider - protocol. - """ - # Validate numeric parameters - _validate_positive_int(max_cell_length, "max_cell_length") - _validate_positive_int(max_width, "max_width") - _validate_positive_int(max_height, "max_height") - _validate_positive_int(max_memory_bytes, "max_memory_bytes") - _validate_positive_int(min_rows_display, "min_rows_display") - _validate_positive_int(repr_rows, "repr_rows") - - # Validate boolean parameters - _validate_bool(enable_cell_expansion, "enable_cell_expansion") - _validate_bool(show_truncation_message, "show_truncation_message") - _validate_bool(use_shared_styles, "use_shared_styles") - - # Validate custom_css - if custom_css is not None and not isinstance(custom_css, str): - msg = "custom_css must be None or a string" - raise TypeError(msg) - - # Validate style_provider - if style_provider is not None and not isinstance(style_provider, StyleProvider): - msg = "style_provider must implement the StyleProvider protocol" - raise TypeError(msg) - - self.max_cell_length = max_cell_length - self.max_width = max_width - self.max_height = max_height - self.max_memory_bytes = max_memory_bytes - self.min_rows_display = min_rows_display - self.repr_rows = repr_rows - self.enable_cell_expansion = enable_cell_expansion - self.custom_css = custom_css - self.show_truncation_message = show_truncation_message - self.style_provider = style_provider or DefaultStyleProvider() - self.use_shared_styles = use_shared_styles - # Registry for custom type formatters - self._type_formatters: dict[type, CellFormatter] = {} - # Custom cell builders - self._custom_cell_builder: Optional[Callable[[Any, int, int, str], str]] = None - self._custom_header_builder: Optional[Callable[[Any], str]] = None - - def register_formatter(self, type_class: type, formatter: CellFormatter) -> None: - """Register a custom formatter for a specific data type. - - Args: - type_class: The type to register a formatter for - formatter: Function that takes a value of the given type and returns - a formatted string - """ - self._type_formatters[type_class] = formatter - - def set_custom_cell_builder( - self, builder: Callable[[Any, int, int, str], str] - ) -> None: - """Set a custom cell builder function. - - Args: - builder: Function that takes (value, row, col, table_id) and returns HTML - """ - self._custom_cell_builder = builder - - def set_custom_header_builder(self, builder: Callable[[Any], str]) -> None: - """Set a custom header builder function. - - Args: - builder: Function that takes a field and returns HTML - """ - self._custom_header_builder = builder - - @classmethod - def is_styles_loaded(cls) -> bool: - """Check if HTML styles have been loaded in the current session. - - This method is primarily intended for debugging UI rendering issues - related to style loading. - - Returns: - True if styles have been loaded, False otherwise - - Example: - >>> from datafusion.html_formatter import DataFrameHtmlFormatter - >>> DataFrameHtmlFormatter.is_styles_loaded() - False - """ - return cls._styles_loaded - - def format_html( - self, - batches: list, - schema: Any, - has_more: bool = False, - table_uuid: str | None = None, - ) -> str: - """Format record batches as HTML. - - This method is used by DataFrame's _repr_html_ implementation and can be - called directly when custom HTML rendering is needed. - - Args: - batches: List of Arrow RecordBatch objects - schema: Arrow Schema object - has_more: Whether there are more batches not shown - table_uuid: Unique ID for the table, used for JavaScript interactions - - Returns: - HTML string representation of the data - - Raises: - TypeError: If schema is invalid and no batches are provided - """ - if not batches: - return "No data to display" - - # Validate schema - if schema is None or not hasattr(schema, "__iter__"): - msg = "Schema must be provided" - raise TypeError(msg) - - # Generate a unique ID if none provided - table_uuid = table_uuid or f"df-{id(batches)}" - - # Build HTML components - html = [] - - # Only include styles and scripts if: - # 1. Not using shared styles, OR - # 2. Using shared styles but they haven't been loaded yet - include_styles = ( - not self.use_shared_styles or not DataFrameHtmlFormatter._styles_loaded - ) - - if include_styles: - html.extend(self._build_html_header()) - # If we're using shared styles, mark them as loaded - if self.use_shared_styles: - DataFrameHtmlFormatter._styles_loaded = True - - html.extend(self._build_table_container_start()) - - # Add table header and body - html.extend(self._build_table_header(schema)) - html.extend(self._build_table_body(batches, table_uuid)) - - html.append("</table>") - html.append("</div>") - - # Add footer (JavaScript and messages) - if include_styles and self.enable_cell_expansion: - html.append(self._get_javascript()) - - # Always add truncation message if needed (independent of styles) - if has_more and self.show_truncation_message: - html.append("<div>Data truncated due to size.</div>") - - return "\n".join(html) - - def _build_html_header(self) -> list[str]: - """Build the HTML header with CSS styles.""" - html = [] - html.append("<style>") - # Only include expandable CSS if cell expansion is enabled - if self.enable_cell_expansion: - html.append(self._get_default_css()) - if self.custom_css: - html.append(self.custom_css) - html.append("</style>") - return html +import warnings - def _build_table_container_start(self) -> list[str]: - """Build the opening tags for the table container.""" - html = [] - html.append( - f'<div style="width: 100%; max-width: {self.max_width}px; ' - f"max-height: {self.max_height}px; overflow: auto; border: " - '1px solid #ccc;">' - ) - html.append('<table style="border-collapse: collapse; min-width: 100%">') - return html +from datafusion.dataframe_formatter import * # noqa: F403 - def _build_table_header(self, schema: Any) -> list[str]: - """Build the HTML table header with column names.""" - html = [] - html.append("<thead>") - html.append("<tr>") - for field in schema: - if self._custom_header_builder: - html.append(self._custom_header_builder(field)) - else: - html.append( - f"<th style='{self.style_provider.get_header_style()}'>" - f"{field.name}</th>" - ) - html.append("</tr>") - html.append("</thead>") - return html - - def _build_table_body(self, batches: list, table_uuid: str) -> list[str]: - """Build the HTML table body with data rows.""" - html = [] - html.append("<tbody>") - - row_count = 0 - for batch in batches: - for row_idx in range(batch.num_rows): - row_count += 1 - html.append("<tr>") - - for col_idx, column in enumerate(batch.columns): - # Get the raw value from the column - raw_value = self._get_cell_value(column, row_idx) - - # Always check for type formatters first to format the value - formatted_value = self._format_cell_value(raw_value) - - # Then apply either custom cell builder or standard cell formatting - if self._custom_cell_builder: - # Pass both the raw value and formatted value to let the - # builder decide - cell_html = self._custom_cell_builder( - raw_value, row_count, col_idx, table_uuid - ) - html.append(cell_html) - else: - # Standard cell formatting with formatted value - if ( - len(str(raw_value)) > self.max_cell_length - and self.enable_cell_expansion - ): - cell_html = self._build_expandable_cell( - formatted_value, row_count, col_idx, table_uuid - ) - else: - cell_html = self._build_regular_cell(formatted_value) - html.append(cell_html) - - html.append("</tr>") - - html.append("</tbody>") - return html - - def _get_cell_value(self, column: Any, row_idx: int) -> Any: - """Extract a cell value from a column. - - Args: - column: Arrow array - row_idx: Row index - - Returns: - The raw cell value - """ - try: - value = column[row_idx] - - if hasattr(value, "as_py"): - return value.as_py() - except (AttributeError, TypeError): - pass - else: - return value - - def _format_cell_value(self, value: Any) -> str: - """Format a cell value for display. - - Uses registered type formatters if available. - - Args: - value: The cell value to format - - Returns: - Formatted cell value as string - """ - # Check for custom type formatters - for type_cls, formatter in self._type_formatters.items(): - if isinstance(value, type_cls): - return formatter(value) - - # If no formatter matched, return string representation - return str(value) - - def _build_expandable_cell( - self, formatted_value: str, row_count: int, col_idx: int, table_uuid: str - ) -> str: - """Build an expandable cell for long content.""" - short_value = str(formatted_value)[: self.max_cell_length] - return ( - f"<td style='{self.style_provider.get_cell_style()}'>" - f"<div class='expandable-container'>" - "<span class='expandable' " - f"id='{table_uuid}-min-text-{row_count}-{col_idx}'>" - f"{short_value}</span>" - "<span class='full-text' " - f"id='{table_uuid}-full-text-{row_count}-{col_idx}'>" - f"{formatted_value}</span>" - f"<button class='expand-btn' " - f"onclick=\"toggleDataFrameCellText('{table_uuid}',{row_count},{col_idx})\">" - f"...</button>" - f"</div>" - f"</td>" - ) - - def _build_regular_cell(self, formatted_value: str) -> str: - """Build a regular table cell.""" - return ( - f"<td style='{self.style_provider.get_cell_style()}'>{formatted_value}</td>" - ) - - def _build_html_footer(self, has_more: bool) -> list[str]: - """Build the HTML footer with JavaScript and messages.""" - html = [] - - # Add JavaScript for interactivity only if cell expansion is enabled - # and we're not using the shared styles approach - if self.enable_cell_expansion and not self.use_shared_styles: - html.append(self._get_javascript()) - - # Add truncation message if needed - if has_more and self.show_truncation_message: - html.append("<div>Data truncated due to size.</div>") - - return html - - def _get_default_css(self) -> str: - """Get default CSS styles for the HTML table.""" - return """ - .expandable-container { - display: inline-block; - max-width: 200px; - } - .expandable { - white-space: nowrap; - overflow: hidden; - text-overflow: ellipsis; - display: block; - } - .full-text { - display: none; - white-space: normal; - } - .expand-btn { - cursor: pointer; - color: blue; - text-decoration: underline; - border: none; - background: none; - font-size: inherit; - display: block; - margin-top: 5px; - } - """ - - def _get_javascript(self) -> str: - """Get JavaScript code for interactive elements.""" - return """ - <script> - function toggleDataFrameCellText(table_uuid, row, col) { - var shortText = document.getElementById( - table_uuid + "-min-text-" + row + "-" + col - ); - var fullText = document.getElementById( - table_uuid + "-full-text-" + row + "-" + col - ); - var button = event.target; - - if (fullText.style.display === "none") { - shortText.style.display = "none"; - fullText.style.display = "inline"; - button.textContent = "(less)"; - } else { - shortText.style.display = "inline"; - fullText.style.display = "none"; - button.textContent = "..."; - } - } - </script> - """ - - -class FormatterManager: - """Manager class for the global DataFrame HTML formatter instance.""" - - _default_formatter: DataFrameHtmlFormatter = DataFrameHtmlFormatter() - - @classmethod - def set_formatter(cls, formatter: DataFrameHtmlFormatter) -> None: - """Set the global DataFrame HTML formatter. - - Args: - formatter: The formatter instance to use globally - """ - cls._default_formatter = formatter - _refresh_formatter_reference() - - @classmethod - def get_formatter(cls) -> DataFrameHtmlFormatter: - """Get the current global DataFrame HTML formatter. - - Returns: - The global HTML formatter instance - """ - return cls._default_formatter - - -def get_formatter() -> DataFrameHtmlFormatter: - """Get the current global DataFrame HTML formatter. - - This function is used by the DataFrame._repr_html_ implementation to access - the shared formatter instance. It can also be used directly when custom - HTML rendering is needed. - - Returns: - The global HTML formatter instance - - Example: - >>> from datafusion.html_formatter import get_formatter - >>> formatter = get_formatter() - >>> formatter.max_cell_length = 50 # Increase cell length - """ - return FormatterManager.get_formatter() - - -def set_formatter(formatter: DataFrameHtmlFormatter) -> None: - """Set the global DataFrame HTML formatter. - - Args: - formatter: The formatter instance to use globally - - Example: - >>> from datafusion.html_formatter import get_formatter, set_formatter - >>> custom_formatter = DataFrameHtmlFormatter(max_cell_length=100) - >>> set_formatter(custom_formatter) - """ - FormatterManager.set_formatter(formatter) - - -def configure_formatter(**kwargs: Any) -> None: - """Configure the global DataFrame HTML formatter. - - This function creates a new formatter with the provided configuration - and sets it as the global formatter for all DataFrames. - - Args: - **kwargs: Formatter configuration parameters like max_cell_length, - max_width, max_height, enable_cell_expansion, etc. - - Raises: - ValueError: If any invalid parameters are provided - - Example: - >>> from datafusion.html_formatter import configure_formatter - >>> configure_formatter( - ... max_cell_length=50, - ... max_height=500, - ... enable_cell_expansion=True, - ... use_shared_styles=True - ... ) - """ - # Valid parameters accepted by DataFrameHtmlFormatter - valid_params = { - "max_cell_length", - "max_width", - "max_height", - "max_memory_bytes", - "min_rows_display", - "repr_rows", - "enable_cell_expansion", - "custom_css", - "show_truncation_message", - "style_provider", - "use_shared_styles", - } - - # Check for invalid parameters - invalid_params = set(kwargs) - valid_params - if invalid_params: - msg = ( - f"Invalid formatter parameters: {', '.join(invalid_params)}. " - f"Valid parameters are: {', '.join(valid_params)}" - ) - raise ValueError(msg) - - # Create and set formatter with validated parameters - set_formatter(DataFrameHtmlFormatter(**kwargs)) - - -def reset_formatter() -> None: - """Reset the global DataFrame HTML formatter to default settings. - - This function creates a new formatter with default configuration - and sets it as the global formatter for all DataFrames. - - Example: - >>> from datafusion.html_formatter import reset_formatter - >>> reset_formatter() # Reset formatter to default settings - """ - formatter = DataFrameHtmlFormatter() - # Reset the styles_loaded flag to ensure styles will be reloaded - DataFrameHtmlFormatter._styles_loaded = False - set_formatter(formatter) - - -def reset_styles_loaded_state() -> None: - """Reset the styles loaded state to force reloading of styles. - - This can be useful when switching between notebook sessions or - when styles need to be refreshed. - - Example: - >>> from datafusion.html_formatter import reset_styles_loaded_state - >>> reset_styles_loaded_state() # Force styles to reload in next render - """ - DataFrameHtmlFormatter._styles_loaded = False - - -def _refresh_formatter_reference() -> None: - """Refresh formatter reference in any modules using it. - - This helps ensure that changes to the formatter are reflected in existing - DataFrames that might be caching the formatter reference. - """ - # This is a no-op but signals modules to refresh their reference +warnings.warn( + "The module 'html_formatter' is deprecated and will be removed in the next release." + "Please use 'dataframe_formatter' instead.", + DeprecationWarning, + stacklevel=3, +) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index deaa30b3..c9ae38d8 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -37,14 +37,14 @@ from datafusion import ( from datafusion import ( functions as f, ) -from datafusion.expr import Window -from datafusion.html_formatter import ( +from datafusion.dataframe_formatter import ( DataFrameHtmlFormatter, configure_formatter, get_formatter, reset_formatter, reset_styles_loaded_state, ) +from datafusion.expr import Window from pyarrow.csv import write_csv MB = 1024 * 1024 diff --git a/src/dataframe.rs b/src/dataframe.rs index 3d68db27..c2ad4771 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -24,6 +24,7 @@ use arrow::compute::can_cast_types; use arrow::error::ArrowError; use arrow::ffi::FFI_ArrowSchema; use arrow::ffi_stream::FFI_ArrowArrayStream; +use arrow::pyarrow::FromPyArrow; use datafusion::arrow::datatypes::Schema; use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; use datafusion::arrow::util::pretty; @@ -150,9 +151,9 @@ fn get_python_formatter_with_config(py: Python) -> PyResult<PythonFormatter> { Ok(PythonFormatter { formatter, config }) } -/// Get the Python formatter from the datafusion.html_formatter module +/// Get the Python formatter from the datafusion.dataframe_formatter module fn import_python_formatter(py: Python) -> PyResult<Bound<'_, PyAny>> { - let formatter_module = py.import("datafusion.html_formatter")?; + let formatter_module = py.import("datafusion.dataframe_formatter")?; let get_formatter = formatter_module.getattr("get_formatter")?; get_formatter.call0() } @@ -295,6 +296,46 @@ impl PyDataFrame { pub fn new(df: DataFrame) -> Self { Self { df: Arc::new(df) } } + + fn prepare_repr_string(&self, py: Python, as_html: bool) -> PyDataFusionResult<String> { + // Get the Python formatter and config + let PythonFormatter { formatter, config } = get_python_formatter_with_config(py)?; + let (batches, has_more) = wait_for_future( + py, + collect_record_batches_to_display(self.df.as_ref().clone(), config), + )??; + if batches.is_empty() { + // This should not be reached, but do it for safety since we index into the vector below + return Ok("No data to display".to_string()); + } + + let table_uuid = uuid::Uuid::new_v4().to_string(); + + // Convert record batches to PyObject list + let py_batches = batches + .into_iter() + .map(|rb| rb.to_pyarrow(py)) + .collect::<PyResult<Vec<PyObject>>>()?; + + let py_schema = self.schema().into_pyobject(py)?; + + let kwargs = pyo3::types::PyDict::new(py); + let py_batches_list = PyList::new(py, py_batches.as_slice())?; + kwargs.set_item("batches", py_batches_list)?; + kwargs.set_item("schema", py_schema)?; + kwargs.set_item("has_more", has_more)?; + kwargs.set_item("table_uuid", table_uuid)?; + + let method_name = match as_html { + true => "format_html", + false => "format_str", + }; + + let html_result = formatter.call_method(method_name, (), Some(&kwargs))?; + let html_str: String = html_result.extract()?; + + Ok(html_str) + } } #[pymethods] @@ -321,18 +362,27 @@ impl PyDataFrame { } fn __repr__(&self, py: Python) -> PyDataFusionResult<String> { - // Get the Python formatter config - let PythonFormatter { - formatter: _, - config, - } = get_python_formatter_with_config(py)?; - let (batches, has_more) = wait_for_future( - py, - collect_record_batches_to_display(self.df.as_ref().clone(), config), - )??; + self.prepare_repr_string(py, false) + } + + #[staticmethod] + #[expect(unused_variables)] + fn default_str_repr<'py>( + batches: Vec<Bound<'py, PyAny>>, + schema: &Bound<'py, PyAny>, + has_more: bool, + table_uuid: &str, + ) -> PyResult<String> { + let batches = batches + .into_iter() + .map(|batch| RecordBatch::from_pyarrow_bound(&batch)) + .collect::<PyResult<Vec<RecordBatch>>>()? + .into_iter() + .filter(|batch| batch.num_rows() > 0) + .collect::<Vec<_>>(); + if batches.is_empty() { - // This should not be reached, but do it for safety since we index into the vector below - return Ok("No data to display".to_string()); + return Ok("No data to display".to_owned()); } let batches_as_displ = @@ -347,38 +397,7 @@ impl PyDataFrame { } fn _repr_html_(&self, py: Python) -> PyDataFusionResult<String> { - // Get the Python formatter and config - let PythonFormatter { formatter, config } = get_python_formatter_with_config(py)?; - let (batches, has_more) = wait_for_future( - py, - collect_record_batches_to_display(self.df.as_ref().clone(), config), - )??; - if batches.is_empty() { - // This should not be reached, but do it for safety since we index into the vector below - return Ok("No data to display".to_string()); - } - - let table_uuid = uuid::Uuid::new_v4().to_string(); - - // Convert record batches to PyObject list - let py_batches = batches - .into_iter() - .map(|rb| rb.to_pyarrow(py)) - .collect::<PyResult<Vec<PyObject>>>()?; - - let py_schema = self.schema().into_pyobject(py)?; - - let kwargs = pyo3::types::PyDict::new(py); - let py_batches_list = PyList::new(py, py_batches.as_slice())?; - kwargs.set_item("batches", py_batches_list)?; - kwargs.set_item("schema", py_schema)?; - kwargs.set_item("has_more", has_more)?; - kwargs.set_item("table_uuid", table_uuid)?; - - let html_result = formatter.call_method("format_html", (), Some(&kwargs))?; - let html_str: String = html_result.extract()?; - - Ok(html_str) + self.prepare_repr_string(py, true) } /// Calculate summary statistics for a DataFrame --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org For additional commands, e-mail: commits-h...@datafusion.apache.org