jorisvandenbossche commented on a change in pull request #8088:
URL: https://github.com/apache/arrow/pull/8088#discussion_r488714572
##########
File path: python/pyarrow/tests/test_types.py
##########
@@ -280,6 +284,13 @@ def test_tzinfo_to_string_errors():
pa.lib.tzinfo_to_string(tz)
+@h.given(tzst.timezones())
+def test_timezone_roundtrip(tz):
Review comment:
This is specifically a `pytz` timezone roundtrip; can you clarify that in
the test name or a comment?
##########
File path: python/pyarrow/array.pxi
##########
@@ -21,28 +21,28 @@ import warnings
cdef _sequence_to_array(object sequence, object mask, object size,
DataType type, CMemoryPool* pool, c_bool from_pandas):
- cdef int64_t c_size
- cdef PyConversionOptions options
+ cdef:
+ int64_t c_size
+ PyConversionOptions options
+ shared_ptr[CChunkedArray] chunked
if type is not None:
options.type = type.sp_type
if size is not None:
options.size = size
- options.pool = pool
options.from_pandas = from_pandas
- options.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False)
Review comment:
What's the reason this is being removed? It's handled elsewhere now?
##########
File path: python/pyarrow/scalar.pxi
##########
@@ -610,12 +609,10 @@ cdef class StructScalar(Scalar, collections.abc.Mapping):
def __getitem__(self, key):
"""
Return the child value for the given field.
-
Review comment:
Was this whitespace removed by accident? (In any case, rst requires an
empty line here.) Same for a few lines below.
##########
File path: python/pyarrow/array.pxi
##########
@@ -158,24 +158,44 @@ def array(object obj, type=None, mask=None, size=None,
from_pandas=None,
Notes
-----
Localized timestamps will currently be returned as UTC (pandas's native
- representation). Timezone-naive data will be implicitly interpreted as
+ representation). Timezone-naive data will be implicitly interpreted as
UTC.
+ Converting to dictionary array will promote to a wider integer type for
+ indices if the number of distinct values cannot be represented, even if
+ the index type was explicitly set. This means that if there are more than
+ 127 values the returned dictionary array's index type will be at least
+ pa.int16() even if pa.int8() was passed to the function. Note that an
+ explicit index type will not be demoted even if it is wider than required.
+
Examples
--------
>>> import pandas as pd
>>> import pyarrow as pa
>>> pa.array(pd.Series([1, 2]))
- <pyarrow.array.Int64Array object at 0x7f674e4c0e10>
+ <pyarrow.lib.Int64Array object at 0x7f674e4c0e10>
[
1,
2
]
+ >>> pa.array(["a", "b", "a"], type=pa.dictionary(pa.int8(), pa.string()))
+ <pyarrow.lib.DictionaryArray object at 0x7feb288d9040>
+ -- dictionary:
+ [
+ "a",
+ "b"
+ ]
+ -- indices:
+ [
+ 0,
+ 1,
+ 0
+ ]
+
>>> import numpy as np
- >>> pa.array(pd.Series([1, 2]), np.array([0, 1],
- ... dtype=bool))
- <pyarrow.array.Int64Array object at 0x7f9019e11208>
+ >>> pa.array(pd.Series([1, 2]), np.array([0, 1], dtype=bool))
Review comment:
```suggestion
>>> pa.array(pd.Series([1, 2]), mask=np.array([0, 1], dtype=bool))
```
this example actually doesn't work otherwise
##########
File path: python/pyarrow/tests/test_convert_builtin.py
##########
@@ -1513,6 +1519,108 @@ def test_struct_from_tuples():
pa.array([tup], type=ty)
+def test_struct_from_list_of_pairs():
+ ty = pa.struct([
+ pa.field('a', pa.int32()),
+ pa.field('b', pa.string()),
+ pa.field('c', pa.bool_())
+ ])
+ data = [
+ [('a', 5), ('b', 'foo'), ('c', True)],
+ [('a', 6), ('b', 'bar'), ('c', False)],
+ None
+ ]
+ arr = pa.array(data, type=ty)
+ assert arr.to_pylist() == [
+ {'a': 5, 'b': 'foo', 'c': True},
+ {'a': 6, 'b': 'bar', 'c': False},
+ None
+ ]
+
+ # test with duplicated field names
+ ty = pa.struct([
+ pa.field('a', pa.int32()),
+ pa.field('a', pa.string()),
+ pa.field('b', pa.bool_())
+ ])
+ data = [
+ [('a', 5), ('a', 'foo'), ('b', True)],
+ [('a', 6), ('a', 'bar'), ('b', False)],
+ ]
+ arr = pa.array(data, type=ty)
+ with pytest.raises(KeyError):
+ # TODO(kszucs): ARROW-9997
+ arr.to_pylist()
+
+ # test with empty elements
+ ty = pa.struct([
+ pa.field('a', pa.int32()),
+ pa.field('b', pa.string()),
+ pa.field('c', pa.bool_())
+ ])
+ data = [
+ [],
+ [('a', 5), ('b', 'foo'), ('c', True)],
+ [('a', 2), ('b', 'baz')],
+ [('a', 1), ('b', 'bar'), ('c', False), ('d', 'julia')],
Review comment:
It doesn't raise on "d" being present here?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org