bzhaoopenstack commented on PR #37232: URL: https://github.com/apache/spark/pull/37232#issuecomment-1192144421
> Is it dependent on pandas version being used? See also https://github.com/apache/spark/blob/master/dev/infra/Dockerfile Hi, I tested with pandas 1.3.X and 1.4.X. It's true that with those versions everything is OK and no error is raised. However, on the pandas master branch the error is still raised, and in my environment it goes through a different code path. Below is the good case, pandas 1.3.x and 1.4.x ``` >>> from pyspark import pandas as ps /home/spark/upstream/pandas/pandas/compat/__init__.py:124: UserWarninging to use lzma compression will result in a RuntimeError. warnings.warn(msg) WARNING:root:'PYARROW_IGNORE_TIMEZONE' environment variable was not secutor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for >>> a = ps.DatetimeIndex(['1970-01-01', '1970-01-01', '1970-01-01']) > /home/spark/upstream/pandas/pandas/core/arrays/datetimelike.py(390)a -> if is_object_dtype(dtype): (Pdb) c >>> a > /home/spark/upstream/pandas/pandas/core/arrays/datetimelike.py(390)a -> if is_object_dtype(dtype): (Pdb) l 385 # 3. 
DatetimeArray.astype handles datetime -> period 386 dtype = pandas_dtype(dtype) 387 import pdb; 388 pdb.set_trace() 389 390 -> if is_object_dtype(dtype): 391 return self._box_values(self.asi8.ravel()).reshape 392 elif is_string_dtype(dtype) and not is_categorical_dty 393 if is_extension_array_dtype(dtype): 394 arr_cls = dtype.construct_array_type() 395 return arr_cls._from_sequence(self, dtype=dtyp (Pdb) dtype dtype('O') (Pdb) is_object_dtype(dtype) True (Pdb) w <stdin>(1)<module>() /home/spark/spark/python/pyspark/pandas/indexes/base.py(2770)__repr_ -> pindex = self._psdf._get_or_create_repr_pandas_cache(max_display_co /home/spark/spark/python/pyspark/pandas/frame.py(12780)_get_or_creat -> self, "_repr_pandas_cache", {n: self.head(n + 1)._to_internal_panda /home/spark/spark/python/pyspark/pandas/frame.py(12775)_to_internal_ -> return self._internal.to_pandas_frame /home/spark/spark/python/pyspark/pandas/utils.py(589)wrapped_lazy_pr -> setattr(self, attr_name, fn(self)) /home/spark/spark/python/pyspark/pandas/internal.py(1056)to_pandas_f -> pdf = sdf.toPandas() /home/spark/spark/python/pyspark/sql/pandas/conversion.py(271)toPand -> df[field.name] = _check_series_convert_timestamps_local_tz( /home/spark/spark/python/pyspark/sql/pandas/types.py(382)_check_seri -> return _check_series_convert_timestamps_localize(s, None, timezone) /home/spark/spark/python/pyspark/sql/pandas/types.py(353)_check_seri -> s.apply( /home/spark/upstream/pandas/pandas/core/series.py(4357)apply() -> return SeriesApply(self, func, convert_dtype, args, kwargs).apply() /home/spark/upstream/pandas/pandas/core/apply.py(1043)apply() -> return self.apply_standard() /home/spark/upstream/pandas/pandas/core/apply.py(1092)apply_standard -> values = obj.astype(object)._values /home/spark/upstream/pandas/pandas/core/generic.py(5815)astype() -> new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) /home/spark/upstream/pandas/pandas/core/internals/managers.py(418)as -> return 
self.apply("astype", dtype=dtype, copy=copy, errors=errors) /home/spark/upstream/pandas/pandas/core/internals/managers.py(327)ap -> applied = getattr(b, f)(**kwargs) /home/spark/upstream/pandas/pandas/core/internals/blocks.py(591)asty -> new_values = astype_array_safe(values, dtype, copy=copy, errors=err /home/spark/upstream/pandas/pandas/core/dtypes/cast.py(1309)astype_a -> new_values = astype_array(values, dtype, copy=copy) /home/spark/upstream/pandas/pandas/core/dtypes/cast.py(1254)astype_a -> values = values.astype(dtype, copy=copy) /home/spark/upstream/pandas/pandas/core/arrays/datetimes.py(646)asty -> return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy) > /home/spark/upstream/pandas/pandas/core/arrays/datetimelike.py(390)a -> if is_object_dtype(dtype): (Pdb) n ``` pandas main(master) branch ``` >>> from pyspark import pandas as ps WARNING:root:'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It icutor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you bu >>> a = ps.DatetimeIndex(['1970-01-01', '1970-01-01', '1970-01-01']) /home/spark/spark/python/pyspark/pandas/internal.py:1573: FutureWarning: iternstead. fields = [ > /home/spark/upstream/pandas/pandas/core/arrays/datetimelike.py(430)astype() -> dtype = pandas_dtype(dtype) (Pdb) dtype dtype('O') (Pdb) c /home/spark/spark/python/pyspark/sql/pandas/conversion.py:486: FutureWarning:ems instead. 
for column, series in pdf.iteritems(): >>> a > /home/spark/upstream/pandas/pandas/core/arrays/datetimelike.py(430)astype() -> dtype = pandas_dtype(dtype) (Pdb) dtype dtype('<M8') (Pdb) c Traceback (most recent call last): File "<stdin>", line 1, in <module> File "/home/spark/spark/python/pyspark/pandas/indexes/base.py", line 2770, pindex = self._psdf._get_or_create_repr_pandas_cache(max_display_count).i File "/home/spark/spark/python/pyspark/pandas/frame.py", line 12780, in _ge self, "_repr_pandas_cache", {n: self.head(n + 1)._to_internal_pandas()} File "/home/spark/spark/python/pyspark/pandas/frame.py", line 12775, in _to return self._internal.to_pandas_frame File "/home/spark/spark/python/pyspark/pandas/utils.py", line 589, in wrapp setattr(self, attr_name, fn(self)) File "/home/spark/spark/python/pyspark/pandas/internal.py", line 1056, in t pdf = sdf.toPandas() File "/home/spark/spark/python/pyspark/sql/pandas/conversion.py", line 248, series = series.astype(t, copy=False) File "/home/spark/upstream/pandas/pandas/core/generic.py", line 6095, in as new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) File "/home/spark/upstream/pandas/pandas/core/internals/managers.py", line return self.apply("astype", dtype=dtype, copy=copy, errors=errors) File "/home/spark/upstream/pandas/pandas/core/internals/managers.py", line applied = getattr(b, f)(**kwargs) File "/home/spark/upstream/pandas/pandas/core/internals/blocks.py", line 52 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors) File "/home/spark/upstream/pandas/pandas/core/dtypes/astype.py", line 299, new_values = astype_array(values, dtype, copy=copy) File "/home/spark/upstream/pandas/pandas/core/dtypes/astype.py", line 227, values = values.astype(dtype, copy=copy) File "/home/spark/upstream/pandas/pandas/core/arrays/datetimes.py", line 63 return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy) File "/home/spark/upstream/pandas/pandas/core/arrays/datetimelike.py", line dtype = 
pandas_dtype(dtype) TypeError: Cannot cast DatetimeArray to dtype datetime64 >>> ``` I will debug further to locate the root cause. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
