Github user viirya commented on a diff in the pull request:
https://github.com/apache/spark/pull/18945#discussion_r140143875
--- Diff: python/pyspark/sql/dataframe.py ---
@@ -1761,12 +1761,37 @@ def toPandas(self):
raise ImportError("%s\n%s" % (e.message, msg))
else:
dtype = {}
+ columns_with_null_int = {}
+ def null_handler(rows, columns_with_null_int):
+ for row in rows:
+ row = row.asDict()
+ for column in columns_with_null_int:
+ val = row[column]
+ dt = dtype[column]
+ if val is not None:
+ if abs(val) > 16777216: # Max value before np.float32 loses precision.
+ val = np.float64(val)
+ if np.float64 != dt:
+ dt = np.float64
+ dtype[column] = np.float64
+ else:
+ val = np.float32(val)
+ if dt not in (np.float32, np.float64):
+ dt = np.float32
+ dtype[column] = np.float32
+ row[column] = val
+ row = Row(**row)
+ yield row
+ row_handler = lambda x,y: x
for field in self.schema:
pandas_type = _to_corrected_pandas_type(field.dataType)
+ if pandas_type in (np.int8, np.int16, np.int32) and field.nullable:
+ columns_with_null_int.add(field.name)
--- End diff --
>>> columns_with_null_int = {}
>>> columns_with_null_int.add("test")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
AttributeError: 'dict' object has no attribute 'add'
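`{}` creates an empty dict, not a set, so the `columns_with_null_int.add(field.name)` call in the diff would raise this `AttributeError`; should it be initialized with `set()` instead?
Am I missing something?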
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]