[GitHub] [spark] xinrong-databricks commented on a diff in pull request #36420: [SPARK-39081][PYTHON][SQL] Implement DataFrame.resample and Series.resample

GitBox Tue, 03 May 2022 10:42:53 -0700


xinrong-databricks commented on code in PR #36420:
URL: https://github.com/apache/spark/pull/36420#discussion_r864030291



##########
python/pyspark/pandas/resample.py:
##########
@@ -0,0 +1,528 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+A wrapper for ResampledData to behave similar to pandas Resampler.
+"""
+from abc import ABCMeta
+from distutils.version import LooseVersion
+import re
+from functools import partial
+from typing import (
+    Any,
+    Generic,
+    List,
+    Optional,
+)
+
+import numpy as np
+
+import pandas as pd
+
+if LooseVersion(pd.__version__) >= LooseVersion("1.3.0"):
+    from pandas.core.common import _builtin_table  # type: ignore[attr-defined]
+else:
+    from pandas.core.base import SelectionMixin
+
+    _builtin_table = SelectionMixin._builtin_table  # type: ignore[attr-defined]
+
+from pyspark import SparkContext
+from pyspark.sql import Column, functions as F
+from pyspark.sql.types import (
+    NumericType,
+    StructField,
+    TimestampType,
+)
+
+from pyspark import pandas as ps  # For running doctests and reference resolution in PyCharm.
+from pyspark.pandas._typing import FrameLike
+from pyspark.pandas.frame import DataFrame
+from pyspark.pandas.internal import (
+    InternalField,
+    InternalFrame,
+    SPARK_DEFAULT_INDEX_NAME,
+)
+from pyspark.pandas.missing.resample import (
+    MissingPandasLikeDataFrameResampler,
+    MissingPandasLikeSeriesResampler,
+)
+from pyspark.pandas.series import Series, first_series
+from pyspark.pandas.utils import (
+    scol_for,
+    verify_temp_column_name,
+)
+
+
+class Resampler(Generic[FrameLike], metaclass=ABCMeta):
+    """
+    Class for resampling datetimelike data, a groupby-like operation.
+
+    It's easiest to use obj.resample(...) to use Resampler.
+
+    Parameters
+    ----------
+    psdf : DataFrame
+
+    Returns
+    -------
+    a Resampler of the appropriate type
+
+    Notes
+    -----
+    After resampling, see aggregate, apply, and transform functions.
+    """
+
+    def __init__(
+        self,
+        psdf: DataFrame,
+        resamplekey: Optional[Series],
+        rule: str,
+        closed: Optional[str] = None,
+        label: Optional[str] = None,
+        agg_columns: List[Series] = [],
+    ):
+        self._psdf = psdf
+        self._resamplekey = resamplekey
+
+        parsed = re.findall(r"^([0-9]+)?([A-Za-z]+)$", rule)
+        if len(parsed) != 1:
+            raise ValueError("Unsupported freq {}".format(rule))
+
+        offset_str, unit_str = parsed[0]
+        self._freq_offset = 1
+        if offset_str != "":
+            freq_offset = int(offset_str)
+            if not freq_offset > 0:
+                raise ValueError("invalid rule: '{}'".format(rule))
+            self._freq_offset = freq_offset

Review Comment:
   Do you think we could leverage `from pandas._libs.tslibs import to_offset` here?
   
   ```py
   >>> to_offset("3MIN").freqstr
   '3T'
   >>> to_offset("3MIN").n
   3
   >>> to_offset("3MIN").rule_code
   'T'
   >>> to_offset("3MI")
   ...
   ValueError: Invalid frequency: MI
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] xinrong-databricks commented on a diff in pull request #36420: [SPARK-39081][PYTHON][SQL] Implement DataFrame.resample and Series.resample

Reply via email to