[
https://issues.apache.org/jira/browse/ARROW-8115?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Ilya Orson Sandoval updated ARROW-8115:
---------------------------------------
Description:
h3. Code Sample
{code:python}
import pandas as pd
df = pd.DataFrame({"date": ["", "2019-05-01"]})
df.date = pd.to_datetime(df.date).dt.date
df.to_parquet("issue_NaT_parquet")
{code}
h3. Problem description
The above gives me the following error:
{quote}---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-13-432405bef6ac> in <module>
----> 1 df.to_parquet("test.parquet")
~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\util\_decorators.py
in wrapper(*args, **kwargs)
212 else:
213 kwargs[new_arg_name] = new_arg_value
--> 214 return func(*args, **kwargs)
215
216 return cast(F, wrapper)
~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\frame.py in
to_parquet(self, path, engine, compression, index, partition_cols, **kwargs)
2114 index=index,
2115 partition_cols=partition_cols,
-> 2116 **kwargs,
2117 )
2118
~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\io\parquet.py in
to_parquet(df, path, engine, compression, index, partition_cols, **kwargs)
262 index=index,
263 partition_cols=partition_cols,
--> 264 **kwargs,
265 )
266
~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\io\parquet.py in
write(self, df, path, compression, coerce_timestamps, index, partition_cols,
**kwargs)
99 from_pandas_kwargs["preserve_index"] = index
100
--> 101 table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
102 if partition_cols is not None:
103 self.api.parquet.write_to_dataset(
~\AppData\Local\Continuum\miniconda3\lib\site-packages\pyarrow\table.pxi in
pyarrow.lib.Table.from_pandas()
~\AppData\Local\Continuum\miniconda3\lib\site-packages\pyarrow\pandas_compat.py
in dataframe_to_arrays(df, schema, preserve_index, nthreads, columns, safe)
573 if nthreads == 1:
574 arrays = [convert_column(c, f)
--> 575 for c, f in zip(columns_to_convert, convert_fields)]
576 else:
577 from concurrent import futures
~\AppData\Local\Continuum\miniconda3\lib\site-packages\pyarrow\pandas_compat.py
in <listcomp>(.0)
573 if nthreads == 1:
574 arrays = [convert_column(c, f)
--> 575 for c, f in zip(columns_to_convert, convert_fields)]
576 else:
577 from concurrent import futures
~\AppData\Local\Continuum\miniconda3\lib\site-packages\pyarrow\pandas_compat.py
in convert_column(col, field)
558
559 try:
--> 560 result = pa.array(col, type=type_, from_pandas=True,
safe=safe)
561 except (pa.ArrowInvalid,
562 pa.ArrowNotImplementedError,
~\AppData\Local\Continuum\miniconda3\lib\site-packages\pyarrow\array.pxi in
pyarrow.lib.array()
~\AppData\Local\Continuum\miniconda3\lib\site-packages\pyarrow\array.pxi in
pyarrow.lib._ndarray_to_array()
TypeError: an integer is required (got type datetime.date){quote}
h3. Expected Output
A Parquet file in which the missing (NaT) date is written as a null value alongside the valid date values.
was:
#### Code Sample
```python
import pandas as pd
df = pd.DataFrame({"date": ["", "2019-05-01"]})
df.date = pd.to_datetime(df.date).dt.date
df.to_parquet("issue_NaT_parquet")
```
#### Problem description
The above gives me the following error:
<details>
> ---------------------------------------------------------------------------
> TypeError Traceback (most recent call last)
> <ipython-input-11-432405bef6ac> in <module>
> ----> 1 df.to_parquet("test.parquet")
>
> ~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\util\_decorators.py
> in wrapper(*args, **kwargs)
> 212 else:
> 213 kwargs[new_arg_name] = new_arg_value
> --> 214 return func(*args, **kwargs)
> 215
> 216 return cast(F, wrapper)
>
> ~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\frame.py
> in to_parquet(self, path, engine, compression, index, partition_cols,
> **kwargs)
>
> 2114 index=index,
> 2115 partition_cols=partition_cols,
> -> 2116 **kwargs,
> 2117 )
> 2118
>
> ~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\io\parquet.py
> in to_parquet(df, path, engine, compression, index, partition_cols, **kwargs)
> 262 index=index,
> 263 partition_cols=partition_cols,
> --> 264 **kwargs,
> 265 )
> 266
>
> ~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\io\parquet.py
> in write(self, df, path, compression, coerce_timestamps, index,
> partition_cols,
> **kwargs)
> 99 from_pandas_kwargs["preserve_index"] = index
> 100
> --> 101 table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
> 102 if partition_cols is not None:
> 103 self.api.parquet.write_to_dataset(
>
> ~\AppData\Local\Continuum\miniconda3\lib\site-packages\pyarrow\table.pxi in
> pyarrow.lib.Table.from_pandas()
>
> ~\AppData\Local\Continuum\miniconda3\lib\site-packages\pyarrow\pandas_compat.py
> in dataframe_to_arrays(df, schema, preserve_index, nthreads, columns, safe)
>
> 573 if nthreads == 1:
> 574 arrays = [convert_column(c, f)
> --> 575 for c, f in zip(columns_to_convert, convert_fields)]
> 576 else:
> 577 from concurrent import futures
>
> ~\AppData\Local\Continuum\miniconda3\lib\site-packages\pyarrow\pandas_compat.py
> in <listcomp>(.0)
> 573 if nthreads == 1:
> 574 arrays = [convert_column(c, f)
> --> 575 for c, f in zip(columns_to_convert, convert_fields)]
> 576 else:
> 577 from concurrent import futures
>
> ~\AppData\Local\Continuum\miniconda3\lib\site-packages\pyarrow\pandas_compat.py
> in convert_column(col, field)
> 558
> 559 try:
> --> 560 result = pa.array(col, type=type_, from_pandas=True,
> safe=safe)
> 561 except (pa.ArrowInvalid,
> 562 pa.ArrowNotImplementedError,
>
> ~\AppData\Local\Continuum\miniconda3\lib\site-packages\pyarrow\array.pxi in
> pyarrow.lib.array()
>
> ~\AppData\Local\Continuum\miniconda3\lib\site-packages\pyarrow\array.pxi in
> pyarrow.lib._ndarray_to_array()
>
> TypeError: an integer is required (got type datetime.date)
</details>
#### Expected Output
Parquet with null values mixed with date values.
#### Output of ``pd.show_versions()``
> [Python] NaT not handled correctly when writing
> -----------------------------------------------
>
> Key: ARROW-8115
> URL: https://issues.apache.org/jira/browse/ARROW-8115
> Project: Apache Arrow
> Issue Type: Bug
> Components: Python
> Environment: <details>
> [paste the output of ``pd.show_versions()`` here below this line]
> INSTALLED VERSIONS
> ------------------
> commit : None
> python : 3.7.4.final.0
> python-bits : 64
> OS : Windows
> OS-release : 10
> machine : AMD64
> processor : Intel64 Family 6 Model 60 Stepping 3, GenuineIntel
> byteorder : little
> LC_ALL : None
> LANG : None
> LOCALE : None.None
> pandas : 1.0.1
> numpy : 1.18.1
> pytz : 2019.3
> dateutil : 2.8.0
> pip : 20.0.2
> setuptools : 45.2.0.post20200210
> Cython : None
> pytest : 5.3.5
> hypothesis : None
> sphinx : None
> blosc : None
> feather : None
> xlsxwriter : None
> lxml.etree : 4.4.2
> html5lib : 1.0.1
> pymysql : None
> psycopg2 : None
> jinja2 : 2.11.1
> IPython : 7.12.0
> pandas_datareader: None
> bs4 : 4.8.1
> bottleneck : None
> fastparquet : None
> gcsfs : None
> lxml.etree : 4.4.2
> matplotlib : 3.1.2
> numexpr : None
> odfpy : None
> openpyxl : 3.0.3
> pandas_gbq : None
> pyarrow : 0.16.0
> pytables : None
> pytest : 5.3.5
> pyxlsb : None
> s3fs : 0.4.0
> scipy : None
> sqlalchemy : None
> tables : None
> tabulate : None
> xarray : None
> xlrd : 1.2.0
> xlwt : None
> xlsxwriter : None
> numba : None
> </details>
> Reporter: Ilya Orson Sandoval
> Priority: Major
>
> h3. Code Sample
> {code:python}
> import pandas as pd
> df = pd.DataFrame({"date": ["", "2019-05-01"]})
> df.date = pd.to_datetime(df.date).dt.date
> df.to_parquet("issue_NaT_parquet")
> {code}
> h3. Problem description
> The above gives me the following error:
> {quote}---------------------------------------------------------------------------
> TypeError Traceback (most recent call last)
> <ipython-input-13-432405bef6ac> in <module>
> ----> 1 df.to_parquet("test.parquet")
> ~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\util\_decorators.py
> in wrapper(*args, **kwargs)
> 212 else:
> 213 kwargs[new_arg_name] = new_arg_value
> --> 214 return func(*args, **kwargs)
> 215
> 216 return cast(F, wrapper)
> ~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\frame.py
> in to_parquet(self, path, engine, compression, index, partition_cols,
> **kwargs)
> 2114 index=index,
> 2115 partition_cols=partition_cols,
> -> 2116 **kwargs,
> 2117 )
> 2118
> ~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\io\parquet.py
> in to_parquet(df, path, engine, compression, index, partition_cols, **kwargs)
> 262 index=index,
> 263 partition_cols=partition_cols,
> --> 264 **kwargs,
> 265 )
> 266
> ~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\io\parquet.py
> in write(self, df, path, compression, coerce_timestamps, index,
> partition_cols, **kwargs)
> 99 from_pandas_kwargs["preserve_index"] = index
> 100
> --> 101 table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
> 102 if partition_cols is not None:
> 103 self.api.parquet.write_to_dataset(
> ~\AppData\Local\Continuum\miniconda3\lib\site-packages\pyarrow\table.pxi in
> pyarrow.lib.Table.from_pandas()
> ~\AppData\Local\Continuum\miniconda3\lib\site-packages\pyarrow\pandas_compat.py
> in dataframe_to_arrays(df, schema, preserve_index, nthreads, columns, safe)
> 573 if nthreads == 1:
> 574 arrays = [convert_column(c, f)
> --> 575 for c, f in zip(columns_to_convert, convert_fields)]
> 576 else:
> 577 from concurrent import futures
> ~\AppData\Local\Continuum\miniconda3\lib\site-packages\pyarrow\pandas_compat.py
> in <listcomp>(.0)
> 573 if nthreads == 1:
> 574 arrays = [convert_column(c, f)
> --> 575 for c, f in zip(columns_to_convert, convert_fields)]
> 576 else:
> 577 from concurrent import futures
> ~\AppData\Local\Continuum\miniconda3\lib\site-packages\pyarrow\pandas_compat.py
> in convert_column(col, field)
> 558
> 559 try:
> --> 560 result = pa.array(col, type=type_, from_pandas=True,
> safe=safe)
> 561 except (pa.ArrowInvalid,
> 562 pa.ArrowNotImplementedError,
> ~\AppData\Local\Continuum\miniconda3\lib\site-packages\pyarrow\array.pxi in
> pyarrow.lib.array()
> ~\AppData\Local\Continuum\miniconda3\lib\site-packages\pyarrow\array.pxi in
> pyarrow.lib._ndarray_to_array()
> TypeError: an integer is required (got type datetime.date){quote}
> h3. Expected Output
> A Parquet file in which the missing (NaT) date is written as a null value alongside the valid date values.
--
This message was sent by Atlassian Jira
(v8.3.4#803005)