This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 88f121c47778 [SPARK-46931][PS] Implement `{Frame, Series}.to_hdf`
88f121c47778 is described below
commit 88f121c47778f0755862046d09484a83932cb30b
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Wed Jan 31 08:41:21 2024 -0800
[SPARK-46931][PS] Implement `{Frame, Series}.to_hdf`
### What changes were proposed in this pull request?
Implement `{Frame, Series}.to_hdf`
### Why are the changes needed?
For parity with the pandas API.
### Does this PR introduce _any_ user-facing change?
yes
```
In [3]: df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=['a',
'b', 'c'])
In [4]: df.to_hdf('/tmp/data.h5', key='df', mode='w')
In [5]: psdf = ps.from_pandas(df)
In [6]: psdf.to_hdf('/tmp/data2.h5', key='df', mode='w')
/Users/ruifeng.zheng/Dev/spark/python/pyspark/pandas/utils.py:1015:
PandasAPIOnSparkAdviceWarning: `to_hdf` loads all data into the driver's
memory. It should only be used if the resulting DataFrame is expected to be
small.
warnings.warn(message, PandasAPIOnSparkAdviceWarning)
In [7]: !ls /tmp/*h5
/tmp/data.h5 /tmp/data2.h5
In [8]: !ls -lh /tmp/*h5
-rw-r--r-- 1 ruifeng.zheng wheel 6.9K Jan 31 12:21 /tmp/data.h5
-rw-r--r-- 1 ruifeng.zheng wheel 6.9K Jan 31 12:21 /tmp/data2.h5
```
### How was this patch tested?
Manually tested. `hdf` requires the additional library `pytables`, which in turn
needs [many
prerequisites](https://www.pytables.org/usersguide/installation.html#prerequisites).
Since `pytables` is just an optional dependency of `Pandas`, I think we can
avoid adding it to CI for now.
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #44966 from zhengruifeng/ps_to_hdf.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../docs/source/reference/pyspark.pandas/frame.rst | 1 +
.../source/reference/pyspark.pandas/series.rst | 1 +
python/pyspark/pandas/generic.py | 120 +++++++++++++++++++++
python/pyspark/pandas/missing/frame.py | 1 -
python/pyspark/pandas/missing/series.py | 1 -
5 files changed, 122 insertions(+), 2 deletions(-)
diff --git a/python/docs/source/reference/pyspark.pandas/frame.rst
b/python/docs/source/reference/pyspark.pandas/frame.rst
index 12cf6e7db12f..77b60468b8fb 100644
--- a/python/docs/source/reference/pyspark.pandas/frame.rst
+++ b/python/docs/source/reference/pyspark.pandas/frame.rst
@@ -286,6 +286,7 @@ Serialization / IO / Conversion
DataFrame.to_json
DataFrame.to_dict
DataFrame.to_excel
+ DataFrame.to_hdf
DataFrame.to_clipboard
DataFrame.to_markdown
DataFrame.to_records
diff --git a/python/docs/source/reference/pyspark.pandas/series.rst
b/python/docs/source/reference/pyspark.pandas/series.rst
index 88d1861c6ccf..5606fa93a5f3 100644
--- a/python/docs/source/reference/pyspark.pandas/series.rst
+++ b/python/docs/source/reference/pyspark.pandas/series.rst
@@ -486,6 +486,7 @@ Serialization / IO / Conversion
Series.to_json
Series.to_csv
Series.to_excel
+ Series.to_hdf
Series.to_frame
Pandas-on-Spark specific
diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py
index 77cefb53fe5d..ed2aeb8ea6af 100644
--- a/python/pyspark/pandas/generic.py
+++ b/python/pyspark/pandas/generic.py
@@ -1103,6 +1103,126 @@ class Frame(object, metaclass=ABCMeta):
psdf._to_internal_pandas(), self.to_excel, f, args
)
+ def to_hdf(
+ self,
+ path_or_buf: Union[str, pd.HDFStore],
+ key: str,
+ mode: str = "a",
+ complevel: Optional[int] = None,
+ complib: Optional[str] = None,
+ append: bool = False,
+ format: Optional[str] = None,
+ index: bool = True,
+ min_itemsize: Optional[Union[int, Dict[str, int]]] = None,
+ nan_rep: Optional[Any] = None,
+ dropna: Optional[bool] = None,
+ data_columns: Optional[Union[bool, List[str]]] = None,
+ errors: str = "strict",
+ encoding: str = "UTF-8",
+ ) -> None:
+ """
+ Write the contained data to an HDF5 file using HDFStore.
+
+ .. note:: This method should only be used if the resulting DataFrame
is expected
+ to be small, as all the data is loaded into the driver's
memory.
+
+ .. versionadded:: 4.0.0
+
+ Parameters
+ ----------
+ path_or_buf : str or pandas.HDFStore
+ File path or HDFStore object.
+ key : str
+ Identifier for the group in the store.
+ mode : {'a', 'w', 'r+'}, default 'a'
+ Mode to open file:
+
+ - 'w': write, a new file is created (an existing file with
+ the same name would be deleted).
+ - 'a': append, an existing file is opened for reading and
+ writing, and if the file does not exist it is created.
+ - 'r+': similar to 'a', but the file must already exist.
+
+ complevel : {0-9}, default None
+ Specifies a compression level for data.
+ A value of 0 or None disables compression.
+ complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
+ Specifies the compression library to be used.
+ These additional compressors for Blosc are supported
+ (default if no compressor specified: 'blosc:blosclz'):
+ {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
+ 'blosc:zlib', 'blosc:zstd'}.
+ Specifying a compression library which is not available issues
+ a ValueError.
+ append : bool, default False
+ For Table formats, append the input data to the existing.
+ format : {'fixed', 'table', None}, default 'fixed'
+ Possible values:
+
+ - 'fixed': Fixed format. Fast writing/reading. Not-appendable,
+ nor searchable.
+ - 'table': Table format. Write as a PyTables Table structure
+ which may perform worse but allow more flexible operations
+ like searching / selecting subsets of the data.
+ - If None, pd.get_option('io.hdf.default_format') is checked,
+ followed by fallback to "fixed".
+
+ index : bool, default True
+ Write DataFrame index as a column.
+ min_itemsize : dict or int, optional
+ Map column names to minimum string sizes for columns.
+ nan_rep : Any, optional
+ How to represent null values as str.
+ Not allowed with append=True.
+ dropna : bool, default False, optional
+ Remove missing values.
+ data_columns : list of columns or True, optional
+ List of columns to create as indexed data columns for on-disk
+ queries, or True to use all columns. By default only the axes
+ of the object are indexed. Applicable only to format='table'.
+ errors : str, default 'strict'
+ Specifies how encoding and decoding errors are to be handled.
+ See the errors argument for :func:`open` for a full list
+ of options.
+ encoding : str, default "UTF-8"
+
+ See Also
+ --------
+ DataFrame.to_orc : Write a DataFrame to the binary orc format.
+ DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
+ DataFrame.to_csv : Write out to a csv file.
+
+ Examples
+ --------
+ >>> df = ps.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
+ ... index=['a', 'b', 'c']) # doctest: +SKIP
+ >>> df.to_hdf('data.h5', key='df', mode='w') # doctest: +SKIP
+
+ We can add another object to the same file:
+
+ >>> s = ps.Series([1, 2, 3, 4]) # doctest: +SKIP
+ >>> s.to_hdf('data.h5', key='s') # doctest: +SKIP
+ """
+ log_advice(
+ "`to_hdf` loads all data into the driver's memory. "
+ "It should only be used if the resulting DataFrame is expected to
be small."
+ )
+ # Make sure locals() call is at the top of the function so we don't
capture local variables.
+ args = locals()
+ psdf = self
+
+ if isinstance(self, ps.DataFrame):
+ f = pd.DataFrame.to_hdf
+ elif isinstance(self, ps.Series):
+ f = pd.Series.to_hdf
+ else:
+ raise TypeError(
+ "Constructor expects DataFrame or Series; however, " "got
[%s]" % (self,)
+ )
+ return validate_arguments_and_invoke_function(
+ psdf._to_internal_pandas(), self.to_hdf, f, args
+ )
+
def mean(
self, axis: Optional[Axis] = None, skipna: bool = True, numeric_only:
bool = None
) -> Union[Scalar, "Series"]:
diff --git a/python/pyspark/pandas/missing/frame.py
b/python/pyspark/pandas/missing/frame.py
index 7a4d09d4ea81..25a3a2afa3df 100644
--- a/python/pyspark/pandas/missing/frame.py
+++ b/python/pyspark/pandas/missing/frame.py
@@ -43,7 +43,6 @@ class MissingPandasLikeDataFrame:
reorder_levels = _unsupported_function("reorder_levels")
set_axis = _unsupported_function("set_axis")
to_feather = _unsupported_function("to_feather")
- to_hdf = _unsupported_function("to_hdf")
to_period = _unsupported_function("to_period")
to_sql = _unsupported_function("to_sql")
to_stata = _unsupported_function("to_stata")
diff --git a/python/pyspark/pandas/missing/series.py
b/python/pyspark/pandas/missing/series.py
index 4ee860d6654f..08f21f46b2cc 100644
--- a/python/pyspark/pandas/missing/series.py
+++ b/python/pyspark/pandas/missing/series.py
@@ -40,7 +40,6 @@ class MissingPandasLikeSeries:
infer_objects = _unsupported_function("infer_objects")
reorder_levels = _unsupported_function("reorder_levels")
set_axis = _unsupported_function("set_axis")
- to_hdf = _unsupported_function("to_hdf")
to_period = _unsupported_function("to_period")
to_sql = _unsupported_function("to_sql")
to_timestamp = _unsupported_function("to_timestamp")
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]