pitrou commented on a change in pull request #8008:
URL: https://github.com/apache/arrow/pull/8008#discussion_r476391155
##########
File path: cpp/src/arrow/python/python_to_arrow.cc
##########
@@ -903,6 +897,75 @@ class FixedSizeListConverter : public
BaseListConverter<FixedSizeListType, null_
int64_t list_size_;
};
+// ----------------------------------------------------------------------
+// Convert dictionary
+
+template <typename ValueType, NullCoding null_coding>
+class DictionaryConverter : public SeqConverter {
+ public:
+ using BuilderType = DictionaryBuilder<ValueType>;
+
+ Status Init(ArrayBuilder* builder) override {
+ RETURN_NOT_OK(SeqConverter::Init(builder));
+ typed_builder_ = checked_cast<BuilderType*>(builder);
+ return Status::OK();
+ }
+
+ Status Append(PyObject* obj) override {
+ // Append null if the obj is None or pandas null otherwise the valid value
+ return NullChecker<null_coding>::Check(obj) ? AppendNull() :
AppendValue(obj);
+ }
+
+ protected:
+ BuilderType* typed_builder_;
+};
+
+template <typename ValueType, NullCoding null_coding>
+class PrimitiveDictionaryConverter : public DictionaryConverter<ValueType,
null_coding> {
+ public:
+ Status AppendValue(PyObject* obj) override {
+ ARROW_ASSIGN_OR_RAISE(auto value,
ValueConverter<ValueType>::FromPython(obj));
+ return this->typed_builder_->Append(value);
+ }
+};
+
+template <typename ValueType, NullCoding null_coding>
+class BinaryLikeDictionaryConverter : public DictionaryConverter<ValueType,
null_coding> {
+ public:
+ Status AppendValue(PyObject* obj) override {
+ ARROW_ASSIGN_OR_RAISE(string_view_,
ValueConverter<ValueType>::FromPython(obj));
+ // DCHECK_GE(string_view_.size, 0);
+ RETURN_NOT_OK(this->typed_builder_->Append(string_view_.bytes,
Review comment:
Isn't there a `TypedBuilder::Append(string_view)`? If not, can you
please add it?
##########
File path: cpp/src/arrow/python/python_to_arrow.cc
##########
@@ -903,6 +897,75 @@ class FixedSizeListConverter : public
BaseListConverter<FixedSizeListType, null_
int64_t list_size_;
};
+// ----------------------------------------------------------------------
+// Convert dictionary
+
+template <typename ValueType, NullCoding null_coding>
+class DictionaryConverter : public SeqConverter {
+ public:
+ using BuilderType = DictionaryBuilder<ValueType>;
+
+ Status Init(ArrayBuilder* builder) override {
+ RETURN_NOT_OK(SeqConverter::Init(builder));
+ typed_builder_ = checked_cast<BuilderType*>(builder);
+ return Status::OK();
+ }
+
+ Status Append(PyObject* obj) override {
+ // Append null if the obj is None or pandas null otherwise the valid value
+ return NullChecker<null_coding>::Check(obj) ? AppendNull() :
AppendValue(obj);
+ }
+
+ protected:
+ BuilderType* typed_builder_;
+};
+
+template <typename ValueType, NullCoding null_coding>
+class PrimitiveDictionaryConverter : public DictionaryConverter<ValueType,
null_coding> {
+ public:
+ Status AppendValue(PyObject* obj) override {
+ ARROW_ASSIGN_OR_RAISE(auto value,
ValueConverter<ValueType>::FromPython(obj));
+ return this->typed_builder_->Append(value);
+ }
+};
+
+template <typename ValueType, NullCoding null_coding>
+class BinaryLikeDictionaryConverter : public DictionaryConverter<ValueType,
null_coding> {
+ public:
+ Status AppendValue(PyObject* obj) override {
+ ARROW_ASSIGN_OR_RAISE(string_view_,
ValueConverter<ValueType>::FromPython(obj));
+ // DCHECK_GE(string_view_.size, 0);
Review comment:
Why?
##########
File path: cpp/src/arrow/python/python_to_arrow.cc
##########
@@ -1123,6 +1186,53 @@ class DecimalConverter : public
TypedConverter<arrow::Decimal128Type, null_codin
std::shared_ptr<DecimalType> decimal_type_;
};
+#define DICTIONARY_PRIMITIVE(TYPE_ENUM, TYPE_CLASS) \
Review comment:
Please `#undef` at the end.
##########
File path: python/pyarrow/tests/test_scalars.py
##########
@@ -550,8 +550,8 @@ def test_dictionary():
with pytest.warns(FutureWarning):
assert s.dictionary_value.as_py() == v
- with pytest.raises(pa.ArrowNotImplementedError):
- pickle.loads(pickle.dumps(s))
+ restored = pickle.loads(pickle.dumps(s))
+ assert restored.equals(s)
Review comment:
This should be inside the loop.
(also, null values are not tested?)
##########
File path: python/pyarrow/tests/test_convert_builtin.py
##########
@@ -1640,3 +1640,85 @@ def test_map_from_tuples():
for entry in [[(5,)], [()], [('5', 'foo', True)]]:
with pytest.raises(ValueError, match="(?i)tuple size"):
pa.array([entry], type=pa.map_('i4', 'i4'))
+
+
+def test_dictionary_from_boolean():
+ typ = pa.dictionary(pa.int8(), value_type=pa.bool_())
+ a = pa.array([False, False, True, False, True], type=typ)
+ assert isinstance(a.type, pa.DictionaryType)
+ assert a.type.equals(typ)
+
+ expected_indices = pa.array([0, 0, 1, 0, 1], type=pa.int8())
+ expected_dictionary = pa.array([False, True], type=pa.bool_())
+ assert a.indices.equals(expected_indices)
+ assert a.dictionary.equals(expected_dictionary)
+
+
[email protected]('value_type', [
+ pa.int8(),
+ pa.int16(),
+ pa.int32(),
+ pa.int64(),
+ pa.uint8(),
+ pa.uint16(),
+ pa.uint32(),
+ pa.uint64(),
+ pa.float32(),
+ pa.float64(),
+ pa.date32(),
+ pa.date64(),
+])
+def test_dictionary_from_integers(value_type):
+ typ = pa.dictionary(pa.int8(), value_type=value_type)
+ a = pa.array([1, 2, 1, 1, 2, 3], type=typ)
+ assert isinstance(a.type, pa.DictionaryType)
+ assert a.type.equals(typ)
+
+ expected_indices = pa.array([0, 1, 0, 0, 1, 2], type=pa.int8())
+ expected_dictionary = pa.array([1, 2, 3], type=value_type)
+ assert a.indices.equals(expected_indices)
+ assert a.dictionary.equals(expected_dictionary)
+
+
[email protected]('input_index_type', [
+ pa.int8(),
+ pa.int16(),
+ pa.int32(),
+ pa.int64()
+])
+def test_dictionary_is_always_adaptive(input_index_type):
+ # dictionary array is constructed using adaptive index type builder,
+ # meaning that the input index type is ignored since the output index
+ # type depends on the input data
+ typ = pa.dictionary(input_index_type, value_type=pa.int64())
+
+ a = pa.array(range(2**7), type=typ)
+ expected = pa.dictionary(pa.int8(), pa.int64())
+ assert a.type.equals(expected)
+
+ a = pa.array(range(2**7 + 1), type=typ)
+ expected = pa.dictionary(pa.int16(), pa.int64())
+ assert a.type.equals(expected)
+
+
+def test_dictionary_from_strings():
+ for value_type in [pa.binary(), pa.string()]:
+ typ = pa.dictionary(pa.int8(), value_type)
+ a = pa.array(["", "a", "bb", "a", "bb", "ccc"], type=typ)
Review comment:
Also test with nulls?
##########
File path: cpp/src/arrow/python/python_to_arrow.cc
##########
@@ -903,6 +897,75 @@ class FixedSizeListConverter : public
BaseListConverter<FixedSizeListType, null_
int64_t list_size_;
};
+// ----------------------------------------------------------------------
+// Convert dictionary
+
+template <typename ValueType, NullCoding null_coding>
+class DictionaryConverter : public SeqConverter {
+ public:
+ using BuilderType = DictionaryBuilder<ValueType>;
+
+ Status Init(ArrayBuilder* builder) override {
+ RETURN_NOT_OK(SeqConverter::Init(builder));
+ typed_builder_ = checked_cast<BuilderType*>(builder);
+ return Status::OK();
+ }
+
+ Status Append(PyObject* obj) override {
+ // Append null if the obj is None or pandas null otherwise the valid value
+ return NullChecker<null_coding>::Check(obj) ? AppendNull() :
AppendValue(obj);
+ }
+
+ protected:
+ BuilderType* typed_builder_;
+};
+
+template <typename ValueType, NullCoding null_coding>
+class PrimitiveDictionaryConverter : public DictionaryConverter<ValueType,
null_coding> {
+ public:
+ Status AppendValue(PyObject* obj) override {
+ ARROW_ASSIGN_OR_RAISE(auto value,
ValueConverter<ValueType>::FromPython(obj));
+ return this->typed_builder_->Append(value);
+ }
+};
+
+template <typename ValueType, NullCoding null_coding>
+class BinaryLikeDictionaryConverter : public DictionaryConverter<ValueType,
null_coding> {
+ public:
+ Status AppendValue(PyObject* obj) override {
+ ARROW_ASSIGN_OR_RAISE(string_view_,
ValueConverter<ValueType>::FromPython(obj));
+ // DCHECK_GE(string_view_.size, 0);
+ RETURN_NOT_OK(this->typed_builder_->Append(string_view_.bytes,
+
static_cast<int32_t>(string_view_.size)));
+ return Status::OK();
+ }
+
+ protected:
+ // Create a single instance of PyBytesView here to prevent unnecessary object
+ // creation/destruction
+ PyBytesView string_view_;
+};
+
+template <NullCoding null_coding>
+class FixedSizeBinaryDictionaryConverter
+ : public DictionaryConverter<FixedSizeBinaryType, null_coding> {
+ public:
+ explicit FixedSizeBinaryDictionaryConverter(int32_t byte_width)
+ : byte_width_(byte_width) {}
+
+ Status AppendValue(PyObject* obj) override {
+ ARROW_ASSIGN_OR_RAISE(
+ string_view_, ValueConverter<FixedSizeBinaryType>::FromPython(obj,
byte_width_));
+ RETURN_NOT_OK(this->typed_builder_->Append(string_view_.bytes,
+
static_cast<int32_t>(string_view_.size)));
+ return Status::OK();
+ }
+
+ protected:
+ int32_t byte_width_;
Review comment:
Add `const`
##########
File path: cpp/src/arrow/python/python_to_arrow.cc
##########
@@ -903,6 +897,75 @@ class FixedSizeListConverter : public
BaseListConverter<FixedSizeListType, null_
int64_t list_size_;
};
+// ----------------------------------------------------------------------
+// Convert dictionary
+
+template <typename ValueType, NullCoding null_coding>
+class DictionaryConverter : public SeqConverter {
+ public:
+ using BuilderType = DictionaryBuilder<ValueType>;
+
+ Status Init(ArrayBuilder* builder) override {
+ RETURN_NOT_OK(SeqConverter::Init(builder));
+ typed_builder_ = checked_cast<BuilderType*>(builder);
+ return Status::OK();
+ }
+
+ Status Append(PyObject* obj) override {
+ // Append null if the obj is None or pandas null otherwise the valid value
+ return NullChecker<null_coding>::Check(obj) ? AppendNull() :
AppendValue(obj);
+ }
+
+ protected:
+ BuilderType* typed_builder_;
+};
+
+template <typename ValueType, NullCoding null_coding>
+class PrimitiveDictionaryConverter : public DictionaryConverter<ValueType,
null_coding> {
+ public:
+ Status AppendValue(PyObject* obj) override {
+ ARROW_ASSIGN_OR_RAISE(auto value,
ValueConverter<ValueType>::FromPython(obj));
+ return this->typed_builder_->Append(value);
+ }
+};
+
+template <typename ValueType, NullCoding null_coding>
+class BinaryLikeDictionaryConverter : public DictionaryConverter<ValueType,
null_coding> {
+ public:
+ Status AppendValue(PyObject* obj) override {
+ ARROW_ASSIGN_OR_RAISE(string_view_,
ValueConverter<ValueType>::FromPython(obj));
+ // DCHECK_GE(string_view_.size, 0);
+ RETURN_NOT_OK(this->typed_builder_->Append(string_view_.bytes,
+
static_cast<int32_t>(string_view_.size)));
+ return Status::OK();
+ }
+
+ protected:
+ // Create a single instance of PyBytesView here to prevent unnecessary object
+ // creation/destruction
+ PyBytesView string_view_;
+};
+
+template <NullCoding null_coding>
+class FixedSizeBinaryDictionaryConverter
+ : public DictionaryConverter<FixedSizeBinaryType, null_coding> {
+ public:
+ explicit FixedSizeBinaryDictionaryConverter(int32_t byte_width)
+ : byte_width_(byte_width) {}
+
+ Status AppendValue(PyObject* obj) override {
+ ARROW_ASSIGN_OR_RAISE(
+ string_view_, ValueConverter<FixedSizeBinaryType>::FromPython(obj,
byte_width_));
+ RETURN_NOT_OK(this->typed_builder_->Append(string_view_.bytes,
Review comment:
Same here: it would be good to have `TypedBuilder::Append(string_view)`.
##########
File path: python/pyarrow/tests/test_convert_builtin.py
##########
@@ -1640,3 +1640,85 @@ def test_map_from_tuples():
for entry in [[(5,)], [()], [('5', 'foo', True)]]:
with pytest.raises(ValueError, match="(?i)tuple size"):
pa.array([entry], type=pa.map_('i4', 'i4'))
+
+
+def test_dictionary_from_boolean():
+ typ = pa.dictionary(pa.int8(), value_type=pa.bool_())
+ a = pa.array([False, False, True, False, True], type=typ)
+ assert isinstance(a.type, pa.DictionaryType)
+ assert a.type.equals(typ)
+
+ expected_indices = pa.array([0, 0, 1, 0, 1], type=pa.int8())
+ expected_dictionary = pa.array([False, True], type=pa.bool_())
+ assert a.indices.equals(expected_indices)
+ assert a.dictionary.equals(expected_dictionary)
+
+
[email protected]('value_type', [
+ pa.int8(),
+ pa.int16(),
+ pa.int32(),
+ pa.int64(),
+ pa.uint8(),
+ pa.uint16(),
+ pa.uint32(),
+ pa.uint64(),
+ pa.float32(),
+ pa.float64(),
+ pa.date32(),
+ pa.date64(),
+])
+def test_dictionary_from_integers(value_type):
+ typ = pa.dictionary(pa.int8(), value_type=value_type)
+ a = pa.array([1, 2, 1, 1, 2, 3], type=typ)
+ assert isinstance(a.type, pa.DictionaryType)
+ assert a.type.equals(typ)
+
+ expected_indices = pa.array([0, 1, 0, 0, 1, 2], type=pa.int8())
Review comment:
Can you test with nulls at some point?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]