This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 6523f51 feat: Add dictionary support in integration test utility
(#342)
6523f51 is described below
commit 6523f511799537ac1dc3b210d8e804caff02edd9
Author: Dewey Dunnington <[email protected]>
AuthorDate: Tue Jan 9 15:37:49 2024 +0000
feat: Add dictionary support in integration test utility (#342)
This PR implements dictionary support in the integration test utility
and fixes a few problems identified with integration testing to ensure
that it actually works end-to-end (via
https://github.com/apache/arrow/pull/39302 ). The changes are:
- Batches that contain dictionaries can now be read, written, and
validated using integration testing JSON
- Fixed an issue in the integration test library (anything other than
the first batch previously segfaulted)
- Improved const correctness of nanoarrow.hpp (because dictionaries
required a `std::unordered_map<>` with a `UniqueSchema` and a few const
overloads were missing)
- Fixed the nullability of the top-level batch to match Arrow C++ output
- Fixed the null count of exported arrays (previously they were all
exported as having zero nulls)
It can now be tested with `archery` (after checking out
https://github.com/apache/arrow/pull/39302 ):
```
export ARROW_CPP_EXE_PATH=/Users/deweydunnington/.r-arrow-dev-build/build/debug
export ARROW_NANOARROW_PATH=/path/to/arrow-nanoarrow/build
archery integration --with-cpp=true --with-nanoarrow=true --run-c-data
```
The current failures are limited to the remaining unimplemented types
(datetime types and decimal).
And for future me, or anybody who has to (or wants to) launch a debugger
on a segfaulting integration test in VSCode, it can be done with this
launch.json:
```
{
"type": "lldb",
"request": "launch",
"name": "Debug Integration Tests",
"program": "${workspaceFolder}/.venv/bin/python",
"args": ["-m", "archery.cli", "integration", "--with-cpp=true",
"--with-nanoarrow=true", "--run-c-data"],
"cwd": "${workspaceFolder}",
"env": {
"ARROW_CPP_EXE_PATH":
"/Users/deweydunnington/.r-arrow-dev-build/build/debug",
"ARROW_NANOARROW_PATH": "${workspaceFolder}/out/build/user-local"
}
}
```
---
.../src/nanoarrow/nanoarrow_ipc_decoder.c | 3 +
src/nanoarrow/array.c | 4 +-
src/nanoarrow/array_test.cc | 3 +-
src/nanoarrow/integration/c_data_integration.cc | 4 +-
src/nanoarrow/nanoarrow.hpp | 10 +-
src/nanoarrow/nanoarrow_testing.hpp | 467 ++++++++++++++++++---
src/nanoarrow/nanoarrow_testing_test.cc | 142 +++++++
7 files changed, 573 insertions(+), 60 deletions(-)
diff --git a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c
b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c
index 46b6807..8a5a111 100644
--- a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c
+++ b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c
@@ -1123,6 +1123,9 @@ ArrowErrorCode ArrowIpcDecoderDecodeSchema(struct
ArrowIpcDecoder* decoder,
return result;
}
+ // Top-level batch schema is typically non-nullable
+ tmp.flags = 0;
+
result = ArrowIpcDecoderSetChildren(&tmp, fields, error);
if (result != NANOARROW_OK) {
ArrowSchemaRelease(&tmp);
diff --git a/src/nanoarrow/array.c b/src/nanoarrow/array.c
index 3f24ccb..e790740 100644
--- a/src/nanoarrow/array.c
+++ b/src/nanoarrow/array.c
@@ -1164,8 +1164,8 @@ static int ArrowArrayViewValidateFull(struct
ArrowArrayView* array_view,
// Dictionary valiation not implemented
if (array_view->dictionary != NULL) {
- ArrowErrorSet(error, "Validation for dictionary-encoded arrays is not
implemented");
- return ENOTSUP;
+ NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->dictionary,
error));
+ // TODO: validate the indices
}
return NANOARROW_OK;
diff --git a/src/nanoarrow/array_test.cc b/src/nanoarrow/array_test.cc
index 0101608..739e1d8 100644
--- a/src/nanoarrow/array_test.cc
+++ b/src/nanoarrow/array_test.cc
@@ -2121,9 +2121,8 @@ TEST(ArrayTest, ArrayViewTestDictionary) {
EXPECT_EQ(array_view.buffer_views[1].size_bytes, 2 * sizeof(int32_t));
EXPECT_EQ(array_view.dictionary->buffer_views[2].size_bytes, 6);
- // Full validation not yet supported for dictionary
EXPECT_EQ(ArrowArrayViewValidate(&array_view,
NANOARROW_VALIDATION_LEVEL_FULL, nullptr),
- ENOTSUP);
+ NANOARROW_OK);
EXPECT_EQ(ArrowArrayViewGetIntUnsafe(&array_view, 0), 0);
EXPECT_EQ(ArrowArrayViewGetIntUnsafe(&array_view, 1), 1);
diff --git a/src/nanoarrow/integration/c_data_integration.cc
b/src/nanoarrow/integration/c_data_integration.cc
index 14a92b3..6ab09ea 100644
--- a/src/nanoarrow/integration/c_data_integration.cc
+++ b/src/nanoarrow/integration/c_data_integration.cc
@@ -159,7 +159,7 @@ static ArrowErrorCode ExportBatchFromJson(const char*
json_path, int num_batch,
MaterializedArrayStream data;
NANOARROW_RETURN_NOT_OK(MaterializeJsonFilePath(json_path, &data, num_batch,
error));
- ArrowArrayMove(data.arrays[num_batch].get(), out);
+ ArrowArrayMove(data.arrays[0].get(), out);
return NANOARROW_OK;
}
@@ -173,7 +173,7 @@ static ArrowErrorCode ImportBatchAndCompareToJson(const
char* json_path, int num
nanoarrow::testing::TestingJSONComparison comparison;
NANOARROW_RETURN_NOT_OK(comparison.SetSchema(data.schema.get(), error));
NANOARROW_RETURN_NOT_OK(
- comparison.CompareBatch(actual.get(), data.arrays[num_batch].get(),
error));
+ comparison.CompareBatch(actual.get(), data.arrays[0].get(), error));
if (comparison.num_differences() > 0) {
std::stringstream ss;
comparison.WriteDifferences(ss);
diff --git a/src/nanoarrow/nanoarrow.hpp b/src/nanoarrow/nanoarrow.hpp
index 15914ce..8d5b841 100644
--- a/src/nanoarrow/nanoarrow.hpp
+++ b/src/nanoarrow/nanoarrow.hpp
@@ -206,15 +206,21 @@ class Unique {
/// \brief Move and take ownership of data wrapped by rhs
Unique(Unique&& rhs) : Unique(rhs.get()) {}
+ Unique& operator=(Unique&& rhs) {
+ reset(rhs.get());
+ return *this;
+ }
// These objects are not copyable
- Unique(Unique& rhs) = delete;
+ Unique(const Unique& rhs) = delete;
/// \brief Get a pointer to the data owned by this object
T* get() noexcept { return &data_; }
+ const T* get() const noexcept { return &data_; }
/// \brief Use the pointer operator to access fields of this object
- T* operator->() { return &data_; }
+ T* operator->() noexcept { return &data_; }
+ const T* operator->() const noexcept { return &data_; }
/// \brief Call data's release callback if valid
void reset() { release_pointer(&data_); }
diff --git a/src/nanoarrow/nanoarrow_testing.hpp
b/src/nanoarrow/nanoarrow_testing.hpp
index b62c3ca..951c215 100644
--- a/src/nanoarrow/nanoarrow_testing.hpp
+++ b/src/nanoarrow/nanoarrow_testing.hpp
@@ -15,10 +15,12 @@
// specific language governing permissions and limitations
// under the License.
+#include <algorithm>
#include <iostream>
#include <limits>
#include <sstream>
#include <string>
+#include <unordered_map>
#include <nlohmann/json.hpp>
@@ -35,6 +37,96 @@ namespace nanoarrow {
namespace testing {
+namespace internal {
+
+// Internal representation of the various structures needed to import and/or
export
+// a dictionary array. We use a serialized version of the dictionary value
because
+// nanoarrow doesn't currently have the ability to copy or reference count an
Array.
+struct Dictionary {
+ nanoarrow::UniqueSchema schema;
+ int64_t column_length;
+ std::string column_json;
+};
+
+class DictionaryContext {
+ public:
+ DictionaryContext() : next_id_(0) {}
+
+ ArrowErrorCode RecordSchema(int32_t dictionary_id, const ArrowSchema*
values_schema) {
+ if (!HasDictionaryForId(dictionary_id)) {
+ dictionaries_[dictionary_id] = internal::Dictionary();
+ NANOARROW_RETURN_NOT_OK(
+ ArrowSchemaDeepCopy(values_schema,
dictionaries_[dictionary_id].schema.get()));
+ }
+
+ dictionary_ids_[values_schema] = dictionary_id;
+ return NANOARROW_OK;
+ }
+
+ ArrowErrorCode RecordSchema(const ArrowSchema* values_schema, int32_t*
dictionary_id) {
+ while (HasDictionaryForId(next_id_)) {
+ next_id_++;
+ }
+
+ NANOARROW_RETURN_NOT_OK(RecordSchema(next_id_, values_schema));
+ *dictionary_id = next_id_++;
+ return NANOARROW_OK;
+ }
+
+ void RecordArray(int32_t dictionary_id, int64_t length, std::string
column_json) {
+ dictionaries_[dictionary_id].column_length = length;
+ dictionaries_[dictionary_id].column_json = std::move(column_json);
+ }
+
+ void RecordArray(const ArrowSchema* values_schema, int64_t length,
+ std::string column_json) {
+ auto ids_it = dictionary_ids_.find(values_schema);
+ RecordArray(ids_it->second, length, column_json);
+ }
+
+ bool empty() { return dictionaries_.empty(); }
+
+ void clear() {
+ dictionaries_.clear();
+ dictionary_ids_.clear();
+ next_id_ = 0;
+ }
+
+ bool HasDictionaryForSchema(const ArrowSchema* values_schema) const {
+ return dictionary_ids_.find(values_schema) != dictionary_ids_.end();
+ }
+
+ bool HasDictionaryForId(int32_t dictionary_id) const {
+ return dictionaries_.find(dictionary_id) != dictionaries_.end();
+ }
+
+ const Dictionary& Get(int32_t dictionary_id) const {
+ auto dict_it = dictionaries_.find(dictionary_id);
+ return dict_it->second;
+ }
+
+ const Dictionary& Get(const ArrowSchema* values_schema) const {
+ auto ids_it = dictionary_ids_.find(values_schema);
+ return Get(ids_it->second);
+ }
+
+ const std::vector<int32_t> GetAllIds() const {
+ std::vector<int32_t> out;
+ out.reserve(dictionaries_.size());
+ for (const auto& value : dictionaries_) {
+ out.push_back(value.first);
+ }
+ return out;
+ }
+
+ private:
+ int32_t next_id_;
+ std::unordered_map<int32_t, Dictionary> dictionaries_;
+ std::unordered_map<const ArrowSchema*, int32_t> dictionary_ids_;
+};
+
+} // namespace internal
+
/// \defgroup nanoarrow_testing-json Integration test helpers
///
/// See testing format documentation for details of the JSON representation.
This
@@ -56,6 +148,8 @@ class TestingJSONWriter {
/// avoid serialization issues.
void set_float_precision(int precision) { float_precision_ = precision; }
+ void ResetDictionaries() { dictionaries_.clear(); }
+
/// \brief Write an ArrowArrayStream as a data file JSON object to out
///
/// Creates output like `{"schema": {...}, "batches": [...], ...}`.
@@ -64,6 +158,8 @@ class TestingJSONWriter {
return EINVAL;
}
+ ResetDictionaries();
+
out << R"({"schema": )";
nanoarrow::UniqueSchema schema;
@@ -93,7 +189,14 @@ class TestingJSONWriter {
array.reset();
} while (true);
- out << "]}";
+ out << "]";
+
+ if (!dictionaries_.empty()) {
+ out << R"(, "dictionaries": )";
+ NANOARROW_RETURN_NOT_OK(WriteDictionaryBatches(out));
+ }
+
+ out << "}";
return NANOARROW_OK;
}
@@ -138,7 +241,7 @@ class TestingJSONWriter {
/// Creates output like `{"name" : "col", "type": {...}, ...}`
ArrowErrorCode WriteField(std::ostream& out, const ArrowSchema* field) {
ArrowSchemaView view;
- NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, (ArrowSchema*)field,
nullptr));
+ NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, field, nullptr));
out << "{";
@@ -157,25 +260,37 @@ class TestingJSONWriter {
out << R"(, "nullable": false)";
}
- // Write type
- out << R"(, "type": )";
- NANOARROW_RETURN_NOT_OK(WriteType(out, &view));
+ // For dictionary encoding, write type as the dictionary (values) type,
+ // record the dictionary schema, and write the "dictionary" member
+ if (field->dictionary != nullptr) {
+ ArrowSchemaView dictionary_view;
+ NANOARROW_RETURN_NOT_OK(
+ ArrowSchemaViewInit(&dictionary_view, field->dictionary, nullptr));
- // Write children
- out << R"(, "children": )";
- if (field->n_children == 0) {
- out << "[]";
+ out << R"(, "type": )";
+ NANOARROW_RETURN_NOT_OK(WriteType(out, &dictionary_view));
+
+ int32_t dictionary_id;
+ NANOARROW_RETURN_NOT_OK(
+ dictionaries_.RecordSchema(field->dictionary, &dictionary_id));
+
+ out << R"(, "dictionary": )";
+ view.type = view.storage_type;
+ NANOARROW_RETURN_NOT_OK(WriteFieldDictionary(
+ out, dictionary_id, field->flags & ARROW_FLAG_DICTIONARY_ORDERED,
&view));
+
+ // Write dictionary children
+ out << R"(, "children": )";
+ NANOARROW_RETURN_NOT_OK(WriteFieldChildren(out, field->dictionary));
} else {
- out << "[";
- NANOARROW_RETURN_NOT_OK(WriteField(out, field->children[0]));
- for (int64_t i = 1; i < field->n_children; i++) {
- out << ", ";
- NANOARROW_RETURN_NOT_OK(WriteField(out, field->children[i]));
- }
- out << "]";
- }
+ // Write non-dictionary type/children
+ out << R"(, "type": )";
+ NANOARROW_RETURN_NOT_OK(WriteType(out, &view));
- // TODO: Dictionary (currently fails at WriteType)
+ // Write children
+ out << R"(, "children": )";
+ NANOARROW_RETURN_NOT_OK(WriteFieldChildren(out, field));
+ }
// Write metadata
if (field->metadata != nullptr) {
@@ -341,20 +456,70 @@ class TestingJSONWriter {
}
out << "}";
+
+ // Write the dictionary values to the DictionaryContext for later if
applicable
+ if (field->dictionary != nullptr) {
+ if (!dictionaries_.HasDictionaryForSchema(field->dictionary)) {
+ return EINVAL;
+ }
+
+ std::stringstream dictionary_output;
+ NANOARROW_RETURN_NOT_OK(
+ WriteColumn(dictionary_output, field->dictionary,
value->dictionary));
+ dictionaries_.RecordArray(field->dictionary, value->dictionary->length,
+ std::move(dictionary_output.str()));
+ }
+
+ return NANOARROW_OK;
+ }
+
+ ArrowErrorCode WriteDictionaryBatches(std::ostream& out) {
+ std::vector<int32_t> ids = dictionaries_.GetAllIds();
+ if (ids.empty()) {
+ out << "[]";
+ return NANOARROW_OK;
+ }
+
+ out << "[";
+ std::sort(ids.begin(), ids.end());
+ NANOARROW_RETURN_NOT_OK(WriteDictionaryBatch(out, ids[0]));
+ for (size_t i = 1; i < ids.size(); i++) {
+ out << ", ";
+ NANOARROW_RETURN_NOT_OK(WriteDictionaryBatch(out, ids[i]));
+ }
+ out << "]";
+
return NANOARROW_OK;
}
private:
int float_precision_;
+ internal::DictionaryContext dictionaries_;
- ArrowErrorCode WriteType(std::ostream& out, const ArrowSchemaView* field) {
- ArrowType type;
- if (field->extension_name.data != nullptr) {
- type = field->storage_type;
+ ArrowErrorCode WriteDictionaryBatch(std::ostream& out, int32_t
dictionary_id) {
+ const internal::Dictionary& dict = dictionaries_.Get(dictionary_id);
+ out << R"({"id": )" << dictionary_id << R"(, "data": {"count": )"
+ << dict.column_length << R"(, "columns": [)" << dict.column_json <<
"]}}";
+ return NANOARROW_OK;
+ }
+
+ ArrowErrorCode WriteFieldChildren(std::ostream& out, const ArrowSchema*
field) {
+ if (field->n_children == 0) {
+ out << "[]";
} else {
- type = field->type;
+ out << "[";
+ NANOARROW_RETURN_NOT_OK(WriteField(out, field->children[0]));
+ for (int64_t i = 1; i < field->n_children; i++) {
+ out << ", ";
+ NANOARROW_RETURN_NOT_OK(WriteField(out, field->children[i]));
+ }
+ out << "]";
}
+ return NANOARROW_OK;
+ }
+
+ ArrowErrorCode WriteType(std::ostream& out, const ArrowSchemaView* field) {
out << "{";
switch (field->type) {
@@ -447,6 +612,26 @@ class TestingJSONWriter {
return NANOARROW_OK;
}
+ ArrowErrorCode WriteFieldDictionary(std::ostream& out, int32_t dictionary_id,
+ bool is_ordered,
+ const ArrowSchemaView* indices_field) {
+ out << "{";
+
+ out << R"("id": )" << dictionary_id;
+
+ out << R"(, "indexType": )";
+ NANOARROW_RETURN_NOT_OK(WriteType(out, indices_field));
+
+ if (is_ordered) {
+ out << R"(, "isOrdered": true)";
+ } else {
+ out << R"(, "isOrdered": false)";
+ }
+
+ out << "}";
+ return NANOARROW_OK;
+ }
+
ArrowErrorCode WriteMetadataItem(std::ostream& out, ArrowMetadataReader*
reader) {
ArrowStringView key;
ArrowStringView value;
@@ -746,6 +931,8 @@ class TestingJSONReader {
ArrowErrorCode ReadDataFile(const std::string& data_file_json,
ArrowArrayStream* out,
int num_batch = kNumBatchReadAll,
ArrowError* error = nullptr) {
+ dictionaries_.clear();
+
try {
auto obj = json::parse(data_file_json);
NANOARROW_RETURN_NOT_OK(Check(obj.is_object(), error, "data file must be
object"));
@@ -767,6 +954,11 @@ class TestingJSONReader {
NANOARROW_RETURN_NOT_OK(
ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), error));
+ // Record any dictionaries that might be present
+ if (obj.contains("dictionaries")) {
+ NANOARROW_RETURN_NOT_OK(RecordDictionaryBatches(obj["dictionaries"],
error));
+ }
+
// Get a vector of batch ids to parse
std::vector<size_t> batch_ids;
if (num_batch == kNumBatchOnlySchema) {
@@ -793,8 +985,8 @@ class TestingJSONReader {
NANOARROW_RETURN_NOT_OK(
ArrowArrayInitFromArrayView(array.get(), array_view.get(), error));
SetArrayAllocatorRecursive(array.get());
- NANOARROW_RETURN_NOT_OK(
- SetArrayBatch(batches[batch_ids[i]], array_view.get(),
array.get(), error));
+ NANOARROW_RETURN_NOT_OK(SetArrayBatch(batches[batch_ids[i]],
schema.get(),
+ array_view.get(), array.get(),
error));
ArrowBasicArrayStreamSetArray(stream.get(), i, array.get());
}
@@ -864,7 +1056,8 @@ class TestingJSONReader {
NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromSchema(array.get(), schema,
error));
SetArrayAllocatorRecursive(array.get());
- NANOARROW_RETURN_NOT_OK(SetArrayBatch(obj, array_view.get(),
array.get(), error));
+ NANOARROW_RETURN_NOT_OK(
+ SetArrayBatch(obj, schema, array_view.get(), array.get(), error));
ArrowArrayMove(array.get(), out);
return NANOARROW_OK;
} catch (json::exception& e) {
@@ -894,7 +1087,8 @@ class TestingJSONReader {
SetArrayAllocatorRecursive(array.get());
// Parse the JSON into the array
- NANOARROW_RETURN_NOT_OK(SetArrayColumn(obj, array_view.get(),
array.get(), error));
+ NANOARROW_RETURN_NOT_OK(
+ SetArrayColumn(obj, schema, array_view.get(), array.get(), error));
// Return the result
ArrowArrayMove(array.get(), out);
@@ -907,6 +1101,7 @@ class TestingJSONReader {
private:
ArrowBufferAllocator allocator_;
+ internal::DictionaryContext dictionaries_;
ArrowErrorCode SetSchema(ArrowSchema* schema, const json& value, ArrowError*
error) {
NANOARROW_RETURN_NOT_OK(
@@ -917,6 +1112,9 @@ class TestingJSONReader {
NANOARROW_RETURN_NOT_OK_WITH_ERROR(
ArrowSchemaInitFromType(schema, NANOARROW_TYPE_STRUCT), error);
+ // Top-level schema is non-nullable
+ schema->flags = 0;
+
const auto& fields = value["fields"];
NANOARROW_RETURN_NOT_OK(
Check(fields.is_array(), error, "Schema fields must be array"));
@@ -939,17 +1137,18 @@ class TestingJSONReader {
ArrowErrorCode SetField(ArrowSchema* schema, const json& value, ArrowError*
error) {
NANOARROW_RETURN_NOT_OK(
Check(value.is_object(), error, "Expected Field to be a JSON object"));
+ ArrowSchemaInit(schema);
+
NANOARROW_RETURN_NOT_OK(
Check(value.contains("name"), error, "Field missing key 'name'"));
- NANOARROW_RETURN_NOT_OK(
- Check(value.contains("nullable"), error, "Field missing key
'nullable'"));
NANOARROW_RETURN_NOT_OK(
Check(value.contains("type"), error, "Field missing key 'type'"));
+ NANOARROW_RETURN_NOT_OK(
+ Check(value.contains("nullable"), error, "Field missing key
'nullable'"));
NANOARROW_RETURN_NOT_OK(
Check(value.contains("children"), error, "Field missing key
'children'"));
- ArrowSchemaInit(schema);
-
+ // Name
const auto& name = value["name"];
NANOARROW_RETURN_NOT_OK(Check(name.is_string() || name.is_null(), error,
"Field name must be string or null"));
@@ -959,6 +1158,7 @@ class TestingJSONReader {
error);
}
+ // Nullability
const auto& nullable = value["nullable"];
NANOARROW_RETURN_NOT_OK(
Check(nullable.is_boolean(), error, "Field nullable must be boolean"));
@@ -968,6 +1168,38 @@ class TestingJSONReader {
schema->flags &= ~ARROW_FLAG_NULLABLE;
}
+ // Metadata
+ if (value.contains("metadata")) {
+ NANOARROW_RETURN_NOT_OK(SetMetadata(schema, value["metadata"], error));
+ }
+
+ // If we have a dictionary, this value needs to be in schema->dictionary
+ // and value["dictionary"] needs to be in schema
+ if (value.contains("dictionary")) {
+ // Put the index type in this schema
+ int32_t dictionary_id;
+ NANOARROW_RETURN_NOT_OK(
+ SetDictionary(schema, value["dictionary"], &dictionary_id, error));
+
+ // Allocate a dictionary and put this value (minus dictionary, metadata,
and name)
+ json value_copy = value;
+ value_copy.erase("dictionary");
+ value_copy.erase("metadata");
+ value_copy["name"] = nullptr;
+
NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaAllocateDictionary(schema),
error);
+ NANOARROW_RETURN_NOT_OK(SetField(schema->dictionary, value_copy, error));
+
+ // Keep track of this dictionary_id/schema for parsing batches
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+ dictionaries_.RecordSchema(dictionary_id, schema->dictionary),
error);
+
+ // Validate!
+ ArrowSchemaView schema_view;
+ NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&schema_view, schema,
error));
+
+ return NANOARROW_OK;
+ }
+
NANOARROW_RETURN_NOT_OK(SetType(schema, value["type"], error));
const auto& children = value["children"];
@@ -979,16 +1211,43 @@ class TestingJSONReader {
NANOARROW_RETURN_NOT_OK(SetField(schema->children[i], children[i],
error));
}
- if (value.contains("metadata")) {
- NANOARROW_RETURN_NOT_OK(SetMetadata(schema, value["metadata"], error));
- }
-
// Validate!
ArrowSchemaView schema_view;
NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, error));
return NANOARROW_OK;
}
+ ArrowErrorCode SetDictionary(ArrowSchema* schema, const json& value,
+ int32_t* dictionary_id, ArrowError* error) {
+ NANOARROW_RETURN_NOT_OK(Check(value.is_object(), error, "Dictionary must
be object"));
+ NANOARROW_RETURN_NOT_OK(
+ Check(value.contains("id"), error, "Dictionary missing key 'id'"));
+ NANOARROW_RETURN_NOT_OK(
+ Check(value.contains("indexType"), error, "Dictionary missing key
'type'"));
+ NANOARROW_RETURN_NOT_OK(
+ Check(value.contains("isOrdered"), error, "Dictionary missing key
'isOrdered'"));
+
+ const auto& id = value["id"];
+ NANOARROW_RETURN_NOT_OK(
+ Check(id.is_number_integer(), error, "Dictionary id must be integer"));
+ *dictionary_id = id.get<int32_t>();
+
+ // Parse the index type
+ NANOARROW_RETURN_NOT_OK(SetType(schema, value["indexType"], error));
+
+ // Set the flag
+ const auto& is_ordered = value["isOrdered"];
+ NANOARROW_RETURN_NOT_OK(
+ Check(is_ordered.is_boolean(), error, "Dictionary isOrdered must be
bool"));
+ if (is_ordered.get<bool>()) {
+ schema->flags |= ARROW_FLAG_DICTIONARY_ORDERED;
+ } else {
+ schema->flags &= ~ARROW_FLAG_DICTIONARY_ORDERED;
+ }
+
+ return NANOARROW_OK;
+ }
+
ArrowErrorCode SetType(ArrowSchema* schema, const json& value, ArrowError*
error) {
NANOARROW_RETURN_NOT_OK(Check(value.is_object(), error, "Type must be
object"));
NANOARROW_RETURN_NOT_OK(
@@ -1314,8 +1573,9 @@ class TestingJSONReader {
return NANOARROW_OK;
}
- ArrowErrorCode SetArrayBatch(const json& value, ArrowArrayView* array_view,
- ArrowArray* array, ArrowError* error) {
+ ArrowErrorCode SetArrayBatch(const json& value, const ArrowSchema* schema,
+ ArrowArrayView* array_view, ArrowArray* array,
+ ArrowError* error) {
NANOARROW_RETURN_NOT_OK(
Check(value.is_object(), error, "Expected RecordBatch to be a JSON
object"));
@@ -1337,8 +1597,9 @@ class TestingJSONReader {
"RecordBatch children has incorrect size"));
for (int64_t i = 0; i < array_view->n_children; i++) {
- NANOARROW_RETURN_NOT_OK(
- SetArrayColumn(columns[i], array_view->children[i],
array->children[i], error));
+ NANOARROW_RETURN_NOT_OK(SetArrayColumn(columns[i], schema->children[i],
+ array_view->children[i],
array->children[i],
+ error));
}
// Validate the array view
@@ -1354,8 +1615,56 @@ class TestingJSONReader {
return NANOARROW_OK;
}
- ArrowErrorCode SetArrayColumn(const json& value, ArrowArrayView* array_view,
- ArrowArray* array, ArrowError* error,
+ ArrowErrorCode RecordDictionaryBatches(const json& value, ArrowError* error)
{
+ NANOARROW_RETURN_NOT_OK(Check(value.is_array(), error, "dictionaries must
be array"));
+
+ for (const auto& batch : value) {
+ NANOARROW_RETURN_NOT_OK(RecordDictionaryBatch(batch, error));
+ }
+
+ return NANOARROW_OK;
+ }
+
+ ArrowErrorCode RecordDictionaryBatch(const json& value, ArrowError* error) {
+ NANOARROW_RETURN_NOT_OK(
+ Check(value.is_object(), error, "dictionary batch must be object"));
+ NANOARROW_RETURN_NOT_OK(
+ Check(value.contains("id"), error, "dictionary batch missing key
'id'"));
+ NANOARROW_RETURN_NOT_OK(
+ Check(value.contains("data"), error, "dictionary batch missing key
'data'"));
+
+ const auto& id = value["id"];
+ NANOARROW_RETURN_NOT_OK(
+ Check(id.is_number_integer(), error, "dictionary batch id must be
integer"));
+ int id_int = id.get<int>();
+ NANOARROW_RETURN_NOT_OK(Check(dictionaries_.HasDictionaryForId(id_int),
error,
+ "dictionary batch has unknown id"));
+
+ const auto& batch = value["data"];
+ NANOARROW_RETURN_NOT_OK(
+ Check(batch.is_object(), error, "dictionary batch data must be
object"));
+ NANOARROW_RETURN_NOT_OK(Check(batch.contains("columns"), error,
+ "dictionary batch missing key 'columns'"));
+ NANOARROW_RETURN_NOT_OK(
+ Check(batch.contains("count"), error, "dictionary batch missing key
'count'"));
+
+ const auto& batch_columns = batch["columns"];
+ NANOARROW_RETURN_NOT_OK(Check(batch_columns.is_array() &&
batch_columns.size() == 1,
+ error,
+ "dictionary batch columns must be array of
size 1"));
+
+ const auto& batch_count = batch["count"];
+ NANOARROW_RETURN_NOT_OK(Check(batch_count.is_number_integer(), error,
+ "dictionary batch count must be integer"));
+
+ dictionaries_.RecordArray(id_int, batch_count.get<int32_t>(),
+ batch_columns[0].dump());
+ return NANOARROW_OK;
+ }
+
+ ArrowErrorCode SetArrayColumn(const json& value, const ArrowSchema* schema,
+ ArrowArrayView* array_view, ArrowArray* array,
+ ArrowError* error,
const std::string& parent_error_prefix = "") {
NANOARROW_RETURN_NOT_OK(
Check(value.is_object(), error, "Expected Column to be a JSON
object"));
@@ -1388,7 +1697,8 @@ class TestingJSONReader {
error_prefix + "children has incorrect
size"));
for (int64_t i = 0; i < array_view->n_children; i++) {
- NANOARROW_RETURN_NOT_OK(SetArrayColumn(children[i],
array_view->children[i],
+ NANOARROW_RETURN_NOT_OK(SetArrayColumn(children[i],
schema->children[i],
+ array_view->children[i],
array->children[i], error,
error_prefix));
}
}
@@ -1416,6 +1726,28 @@ class TestingJSONReader {
ArrowBufferView* buffer_view = array_view->buffer_views + i;
buffer_view->data.as_uint8 = buffer->data;
buffer_view->size_bytes = buffer->size_bytes;
+
+ // If this is a validity buffer with a big enough size, set the
array_view's
+ // null_count
+ if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY
&&
+ _ArrowBytesForBits(array_view->length) <= buffer_view->size_bytes) {
+ array_view->null_count =
+ array_view->length -
+ ArrowBitCountSet(buffer_view->data.as_uint8, 0,
array_view->length);
+ }
+ }
+
+ // If there is a dictionary associated with schema, parse its value into
dictionary
+ if (schema->dictionary != nullptr) {
+ NANOARROW_RETURN_NOT_OK(Check(
+ dictionaries_.HasDictionaryForSchema(schema->dictionary), error,
+ error_prefix +
+ "dictionary could not be resolved from dictionary id in
SetArrayColumn()"));
+
+ const internal::Dictionary& dict = dictionaries_.Get(schema->dictionary);
+ NANOARROW_RETURN_NOT_OK(SetArrayColumn(
+ json::parse(dict.column_json), schema->dictionary,
array_view->dictionary,
+ array->dictionary, error, error_prefix + "-> <dictionary> "));
}
// Validate the array view
@@ -1424,9 +1756,10 @@ class TestingJSONReader {
error_prefix + "failed to validate: "));
// Flush length and buffer pointers to the Array
- array->length = array_view->length;
NANOARROW_RETURN_NOT_OK_WITH_ERROR(
ArrowArrayFinishBuilding(array, NANOARROW_VALIDATION_LEVEL_NONE,
nullptr), error);
+ array->length = array_view->length;
+ array->null_count = array_view->null_count;
return NANOARROW_OK;
}
@@ -1885,6 +2218,9 @@ class TestingJSONComparison {
ArrowErrorCode CompareSchema(const ArrowSchema* actual, const ArrowSchema*
expected,
ArrowError* error = nullptr,
const std::string& path = "") {
+ writer_actual_.ResetDictionaries();
+ writer_expected_.ResetDictionaries();
+
// Compare the top-level schema "manually" because (1) map type needs
special-cased
// comparison and (2) it's easier to read the output if differences are
separated
// by field.
@@ -1926,13 +2262,13 @@ class TestingJSONComparison {
// Compare metadata
std::stringstream ss;
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteMetadata(ss,
actual->metadata),
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteMetadata(ss,
actual->metadata),
error);
std::string actual_metadata = ss.str();
ss.str("");
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteMetadata(ss,
expected->metadata),
- error);
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+ writer_expected_.WriteMetadata(ss, expected->metadata), error);
std::string expected_metadata = ss.str();
if (actual_metadata != expected_metadata) {
@@ -1960,6 +2296,14 @@ class TestingJSONComparison {
return EINVAL;
}
+ // "Write" the schema using both writers to ensure dictionary ids can be
resolved
+ // using the ArrowSchema* pointers from schema_
+ std::stringstream ss;
+ writer_actual_.ResetDictionaries();
+ writer_expected_.ResetDictionaries();
+ writer_actual_.WriteSchema(ss, schema_.get());
+ writer_expected_.WriteSchema(ss, schema_.get());
+
return NANOARROW_OK;
}
@@ -1993,7 +2337,8 @@ class TestingJSONComparison {
}
private:
- TestingJSONWriter writer_;
+ TestingJSONWriter writer_actual_;
+ TestingJSONWriter writer_expected_;
std::vector<Difference> differences_;
nanoarrow::UniqueSchema schema_;
nanoarrow::UniqueArrayView actual_;
@@ -2019,11 +2364,11 @@ class TestingJSONComparison {
ArrowError* error, const std::string& path =
"") {
std::stringstream ss;
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteField(ss, expected),
error);
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_expected_.WriteField(ss,
expected), error);
std::string expected_json = ss.str();
ss.str("");
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteField(ss, actual), error);
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteField(ss, actual),
error);
std::string actual_json = ss.str();
if (actual_json != expected_json) {
@@ -2036,13 +2381,31 @@ class TestingJSONComparison {
ArrowErrorCode CompareColumn(ArrowSchema* schema, ArrowArrayView* actual,
ArrowArrayView* expected, ArrowError* error,
const std::string& path = "") {
- std::stringstream ss;
+ // Compare children and dictionaries first, then higher-level structures
after.
+ // This is a redundant because the higher-level serialized JSON will also
report
+ // a difference if deeply nested children have differences; however, it
will not
+ // contain dictionaries and this output is slightly better (more targeted
differences
+ // that are slightly easier to read appear first).
+ for (int64_t i = 0; i < schema->n_children; i++) {
+ NANOARROW_RETURN_NOT_OK(
+ CompareColumn(schema->children[i], actual->children[i],
expected->children[i],
+ error, path + ".children[" + std::to_string(i) + "]"));
+ }
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteColumn(ss, schema,
expected), error);
+ if (schema->dictionary != nullptr) {
+ NANOARROW_RETURN_NOT_OK(CompareColumn(schema->dictionary,
actual->dictionary,
+ expected->dictionary, error,
+ path + ".dictionary"));
+ }
+
+ std::stringstream ss;
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_expected_.WriteColumn(ss,
schema, expected),
+ error);
std::string expected_json = ss.str();
ss.str("");
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteColumn(ss, schema,
actual), error);
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteColumn(ss, schema,
actual),
+ error);
std::string actual_json = ss.str();
if (actual_json != expected_json) {
diff --git a/src/nanoarrow/nanoarrow_testing_test.cc
b/src/nanoarrow/nanoarrow_testing_test.cc
index 217bf9d..248ec69 100644
--- a/src/nanoarrow/nanoarrow_testing_test.cc
+++ b/src/nanoarrow/nanoarrow_testing_test.cc
@@ -406,6 +406,21 @@ TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldBasic)
{
R"({"name": "colname", "nullable": true, "type": {"name": "null"},
"children": []})");
}
+TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldDict) {
+ TestWriteJSON(
+ [](ArrowSchema* schema) {
+ NANOARROW_RETURN_NOT_OK(ArrowSchemaInitFromType(schema,
NANOARROW_TYPE_INT16));
+ NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateDictionary(schema));
+ NANOARROW_RETURN_NOT_OK(
+ ArrowSchemaInitFromType(schema->dictionary,
NANOARROW_TYPE_STRING));
+ return NANOARROW_OK;
+ },
+ [](ArrowArray* array) { return NANOARROW_OK; }, &WriteFieldJSON,
+ R"({"name": null, "nullable": true, "type": {"name": "utf8"}, )"
+ R"("dictionary": {"id": 0, "indexType": {"name": "int", "bitWidth": 16,
"isSigned": true}, )"
+ R"("isOrdered": false}, "children": []})");
+}
+
TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldMetadata) {
// Missing metadata
TestWriteJSON(
@@ -805,6 +820,37 @@ TEST(NanoarrowTestingTest,
NanoarrowTestingTestReadFieldNested) {
EXPECT_STREQ(schema->children[0]->format, "n");
}
+TEST(NanoarrowTestingTest, NanoarrowTestingTestReadFieldDictionary) {
+ nanoarrow::UniqueSchema schema;
+ TestingJSONReader reader;
+
+ // Unordered
+ ASSERT_EQ(
+ reader.ReadField(
+ R"({"name": "col1", "nullable": true, "type": {"name": "utf8"},
"children": [], )"
+ R"("dictionary": {"id": 0, "indexType": {"name": "int", "bitWidth":
32, "isSigned": true}, "isOrdered": false}})",
+ schema.get()),
+ NANOARROW_OK);
+ EXPECT_STREQ(schema->format, "i");
+ EXPECT_STREQ(schema->name, "col1");
+ EXPECT_TRUE(schema->flags & ARROW_FLAG_NULLABLE);
+ EXPECT_FALSE(schema->flags & ARROW_FLAG_DICTIONARY_ORDERED);
+ ASSERT_NE(schema->dictionary, nullptr);
+ EXPECT_STREQ(schema->dictionary->format, "u");
+ EXPECT_EQ(schema->dictionary->name, nullptr);
+ EXPECT_EQ(schema->dictionary->dictionary, nullptr);
+
+ // Ordered
+ schema.reset();
+ ASSERT_EQ(
+ reader.ReadField(
+ R"({"name": "col1", "nullable": true, "type": {"name": "utf8"},
"children": [], )"
+ R"("dictionary": {"id": 0, "indexType": {"name": "int", "bitWidth":
32, "isSigned": true}, "isOrdered": true}})",
+ schema.get()),
+ NANOARROW_OK);
+ EXPECT_TRUE(schema->flags & ARROW_FLAG_DICTIONARY_ORDERED);
+}
+
TEST(NanoarrowTestingTest, NanoarrowTestingTestRoundtripDataFile) {
nanoarrow::UniqueArrayStream stream;
ArrowError error;
@@ -850,6 +896,33 @@ TEST(NanoarrowTestingTest,
NanoarrowTestingTestRoundtripDataFile) {
ASSERT_EQ(reader.ReadDataFile("{", stream.get()), EINVAL);
}
+TEST(NanoarrowTestingTest, NanoarrowTestingTestRoundtripDataFileDictionary) {
+ nanoarrow::UniqueArrayStream stream;
+ ArrowError error;
+ error.message[0] = '\0';
+
+ std::string data_file_json =
+ R"({"schema": {"fields": [{"name": null, "nullable": true, "type":
{"name": "binary"}, )"
+ R"("dictionary": {"id": 0, "indexType": {"name": "int", "bitWidth": 32,
"isSigned": true}, "isOrdered": false}, "children": []}, )"
+ R"({"name": null, "nullable": true, "type": {"name": "utf8"}, )"
+ R"("dictionary": {"id": 1, "indexType": {"name": "int", "bitWidth": 8,
"isSigned": true}, "isOrdered": false}, "children": []}]}, )"
+ R"("batches": [{"count": 1, "columns": [{"name": null, "count": 1,
"VALIDITY": [1], "DATA": [0]}, )"
+ R"({"name": null, "count": 1, "VALIDITY": [1], "DATA": [1]}]}], )"
+ R"("dictionaries": [{"id": 0, "data": {"count": 1, "columns": [{"name":
null, "count": 1, "VALIDITY": [1], "OFFSET": [0, 3], "DATA": ["616263"]}]}}, )"
+ R"({"id": 1, "data": {"count": 2, "columns": [{"name": null, "count": 2,
"VALIDITY": [1, 1], "OFFSET": [0, 3, 6], "DATA": ["abc", "def"]}]}}]})";
+
+ TestingJSONReader reader;
+ ASSERT_EQ(reader.ReadDataFile(data_file_json, stream.get(),
+ TestingJSONReader::kNumBatchReadAll, &error),
+ NANOARROW_OK)
+ << error.message;
+
+ TestingJSONWriter writer;
+ std::stringstream data_file_json_roundtrip;
+ ASSERT_EQ(writer.WriteDataFile(data_file_json_roundtrip, stream.get()),
NANOARROW_OK);
+ EXPECT_EQ(data_file_json_roundtrip.str(), data_file_json);
+}
+
TEST(NanoarrowTestingTest, NanoarrowTestingTestReadBatch) {
nanoarrow::UniqueSchema schema;
nanoarrow::UniqueArray array;
@@ -1178,6 +1251,20 @@ TEST(NanoarrowTestingTest,
NanoarrowTestingTestFieldUnion) {
"Type[name=='union'] mode must be 'DENSE' or 'SPARSE'");
}
+TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldDictionaryRoundtrip) {
+ // Unordered
+ TestFieldRoundtrip(
+ R"({"name": null, "nullable": true, "type": {"name": "utf8"}, )"
+ R"("dictionary": {"id": 0, "indexType": {"name": "int", "bitWidth": 16,
"isSigned": true}, )"
+ R"("isOrdered": false}, "children": []})");
+
+ // Ordered
+ TestFieldRoundtrip(
+ R"({"name": null, "nullable": true, "type": {"name": "utf8"}, )"
+ R"("dictionary": {"id": 0, "indexType": {"name": "int", "bitWidth": 16,
"isSigned": true}, )"
+ R"("isOrdered": true}, "children": []})");
+}
+
void AssertSchemasCompareEqual(ArrowSchema* actual, ArrowSchema* expected) {
TestingJSONComparison comparison;
std::stringstream msg;
@@ -1346,6 +1433,61 @@ TEST(NanoarrowTestingTest,
NanoarrowTestingTestArrayComparison) {
)");
}
+TEST(NanoarrowTestingTest, NanoarrowTestingTestArrayWithDictionaryComparison) {
+ nanoarrow::UniqueSchema schema;
+ nanoarrow::UniqueArray actual;
+ nanoarrow::UniqueArray expected;
+
+ TestingJSONComparison comparison;
+ std::stringstream msg;
+
+ ArrowSchemaInit(schema.get());
+ ASSERT_EQ(ArrowSchemaSetTypeStruct(schema.get(), 1), NANOARROW_OK);
+ ASSERT_EQ(ArrowSchemaSetType(schema->children[0], NANOARROW_TYPE_INT32),
NANOARROW_OK);
+ ASSERT_EQ(ArrowSchemaAllocateDictionary(schema->children[0]), NANOARROW_OK);
+ ASSERT_EQ(
+ ArrowSchemaInitFromType(schema->children[0]->dictionary,
NANOARROW_TYPE_STRING),
+ NANOARROW_OK);
+ ASSERT_EQ(comparison.SetSchema(schema.get()), NANOARROW_OK);
+
+ // Dictionary-encoded with one element
+ ASSERT_EQ(ArrowArrayInitFromSchema(expected.get(), schema.get(), nullptr),
+ NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayStartAppending(expected.get()), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendInt(expected->children[0], 0), NANOARROW_OK);
+ ASSERT_EQ(
+ ArrowArrayAppendString(expected->children[0]->dictionary,
ArrowCharView("abc")),
+ NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayFinishElement(expected.get()), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayFinishBuildingDefault(expected.get(), nullptr),
NANOARROW_OK);
+
+ // Dictionary-encoded with one element with the only difference in the
dictionary
+ ASSERT_EQ(ArrowArrayInitFromSchema(actual.get(), schema.get(), nullptr),
NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayStartAppending(actual.get()), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendInt(actual->children[0], 0), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendString(actual->children[0]->dictionary,
ArrowCharView("def")),
+ NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayFinishElement(actual.get()), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayFinishBuildingDefault(actual.get(), nullptr),
NANOARROW_OK);
+
+ // Compare array with dictionary that has no differences
+ ASSERT_EQ(comparison.CompareBatch(actual.get(), actual.get()), NANOARROW_OK);
+ EXPECT_EQ(comparison.num_differences(), 0);
+ comparison.ClearDifferences();
+
+ // Compare arrays with nested difference in the dictionary
+ ArrowError error;
+ ASSERT_EQ(comparison.CompareBatch(actual.get(), expected.get(), &error),
NANOARROW_OK)
+ << error.message;
+ EXPECT_EQ(comparison.num_differences(), 1);
+ comparison.WriteDifferences(msg);
+ EXPECT_EQ(msg.str(), R"(Path: .children[0].dictionary
+- {"name": null, "count": 1, "VALIDITY": [1], "OFFSET": [0, 3], "DATA":
["def"]}
++ {"name": null, "count": 1, "VALIDITY": [1], "OFFSET": [0, 3], "DATA":
["abc"]}
+
+)");
+}
+
ArrowErrorCode MakeArrayStream(const ArrowSchema* schema,
std::vector<std::string> batches_json,
ArrowArrayStream* out) {