Github user HyukjinKwon commented on a diff in the pull request:
https://github.com/apache/spark/pull/19646#discussion_r148773536
--- Diff: python/pyspark/sql/session.py ---
@@ -416,6 +417,50 @@ def _createFromLocal(self, data, schema):
data = [schema.toInternal(row) for row in data]
return self._sc.parallelize(data), schema
+ def _getNumpyRecordDtypes(self, rec):
+ """
+ Used when converting a pandas.DataFrame to Spark using to_records(), this will correct
+ the dtypes of records so they can be properly loaded into Spark.
+ :param rec: a numpy record to check dtypes
+ :return corrected dtypes for a numpy.record or None if no correction needed
+ """
+ import numpy as np
+ cur_dtypes = rec.dtype
+ col_names = cur_dtypes.names
+ record_type_list = []
+ has_rec_fix = False
+ for i in xrange(len(cur_dtypes)):
+ curr_type = cur_dtypes[i]
+ # If type is a datetime64 timestamp, convert to microseconds
+ # NOTE: if dtype is M8[ns] then np.record.tolist() will output values as longs,
+ # this conversion will lead to an output of py datetime objects, see SPARK-22417
+ if curr_type == np.dtype('M8[ns]'):
+ curr_type = 'M8[us]'
+ has_rec_fix = True
+ record_type_list.append((str(col_names[i]), curr_type))
+ return record_type_list if has_rec_fix else None
+
+ def _convertFromPandas(self, pdf, schema):
+ """
+ Convert a pandas.DataFrame to list of records that can be used to make a DataFrame
+ :return tuple of list of records and schema
+ """
+ # Convert pandas.DataFrame to list of numpy records
+ np_records = pdf.to_records(index=False)
--- End diff --
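For a datetime column I got a long from `tolist()` but a `numpy.datetime64` when indexing the record directly: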
```python
>>> pd.DataFrame({"ts": [datetime(2017, 10, 31, 1, 1, 1)]}).to_records(index=False)[0].tolist()[0]
1509411661000000000L
>>> pd.DataFrame({"ts": [datetime(2017, 10, 31, 1, 1, 1)]}).to_records(index=False)[0][0]
numpy.datetime64('2017-10-31T01:01:01.000000000')
```
whereas for a date column both forms return the same `datetime.date` object:
```python
>>> pd.DataFrame({"d": [pd.Timestamp.now().date()]}).to_records(index=False)[0].tolist()[0]
datetime.date(2017, 11, 3)
>>> pd.DataFrame({"d": [pd.Timestamp.now().date()]}).to_records(index=False)[0][0]
datetime.date(2017, 11, 3)
```
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]