Github user viirya commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18945#discussion_r140143875

    --- Diff: python/pyspark/sql/dataframe.py ---
    @@ -1761,12 +1761,37 @@ def toPandas(self):
                     raise ImportError("%s\n%s" % (e.message, msg))
             else:
                 dtype = {}
    +            columns_with_null_int = {}
    +            def null_handler(rows, columns_with_null_int):
    +                for row in rows:
    +                    row = row.asDict()
    +                    for column in columns_with_null_int:
    +                        val = row[column]
    +                        dt = dtype[column]
    +                        if val is not None:
    +                            if abs(val) > 16777216: # Max value before np.float32 loses precision.
    +                                val = np.float64(val)
    +                                if np.float64 != dt:
    +                                    dt = np.float64
    +                                    dtype[column] = np.float64
    +                            else:
    +                                val = np.float32(val)
    +                                if dt not in (np.float32, np.float64):
    +                                    dt = np.float32
    +                                    dtype[column] = np.float32
    +                            row[column] = val
    +                    row = Row(**row)
    +                    yield row
    +            row_handler = lambda x,y: x
                 for field in self.schema:
                     pandas_type = _to_corrected_pandas_type(field.dataType)
    +                if pandas_type in (np.int8, np.int16, np.int32) and field.nullable:
    +                    columns_with_null_int.add(field.name)
    --- End diff --

    >>> columns_with_null_int = {}
    >>> columns_with_null_int.add("test")
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
    AttributeError: 'dict' object has no attribute 'add'

    Am I missing something?
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org