Github user BryanCutler commented on a diff in the pull request:
https://github.com/apache/spark/pull/19646#discussion_r148707442
--- Diff: python/pyspark/sql/session.py ---
@@ -512,9 +512,39 @@ def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True):
except Exception:
has_pandas = False
if has_pandas and isinstance(data, pandas.DataFrame):
+ import numpy as np
+
+ # Convert pandas.DataFrame to list of numpy records
+ np_records = data.to_records(index=False)
+
+        # Check if any columns need to be fixed for Spark to infer properly
+ record_type_list = None
+ if schema is None and len(np_records) > 0:
+ cur_dtypes = np_records[0].dtype
+ col_names = cur_dtypes.names
+ record_type_list = []
+ has_rec_fix = False
+ for i in xrange(len(cur_dtypes)):
--- End diff --
Oops, I forgot about that. Thanks!
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]