Github user ueshin commented on a diff in the pull request:
https://github.com/apache/spark/pull/20678#discussion_r170792103
--- Diff: python/pyspark/sql/dataframe.py ---
@@ -1986,55 +1986,89 @@ def toPandas(self):
timezone = None
if self.sql_ctx.getConf("spark.sql.execution.arrow.enabled",
"false").lower() == "true":
+ should_fallback = False
try:
- from pyspark.sql.types import
_check_dataframe_convert_date, \
- _check_dataframe_localize_timestamps, to_arrow_schema
+ from pyspark.sql.types import to_arrow_schema
from pyspark.sql.utils import
require_minimum_pyarrow_version
+
require_minimum_pyarrow_version()
- import pyarrow
to_arrow_schema(self.schema)
- tables = self._collectAsArrow()
- if tables:
- table = pyarrow.concat_tables(tables)
- pdf = table.to_pandas()
- pdf = _check_dataframe_convert_date(pdf, self.schema)
- return _check_dataframe_localize_timestamps(pdf,
timezone)
- else:
- return pd.DataFrame.from_records([],
columns=self.columns)
except Exception as e:
- msg = (
- "Note: toPandas attempted Arrow optimization because "
- "'spark.sql.execution.arrow.enabled' is set to true.
Please set it to false "
- "to disable this.")
- raise RuntimeError("%s\n%s" % (_exception_message(e), msg))
- else:
- pdf = pd.DataFrame.from_records(self.collect(),
columns=self.columns)
- dtype = {}
+ if
self.sql_ctx.getConf("spark.sql.execution.arrow.fallback.enabled", "false") \
--- End diff --
We should use the same default value, `"true"`, as is defined in `SQLConf`, so the
Python side stays consistent with the Scala configuration default.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]