(arrow-nanoarrow) branch main updated: feat: Add dictionary support in integration test utility (#342)

paleolimbot Tue, 09 Jan 2024 07:38:00 -0800

This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git



The following commit(s) were added to refs/heads/main by this push:
     new 6523f51  feat: Add dictionary support in integration test utility (#342)
6523f51 is described below

commit 6523f511799537ac1dc3b210d8e804caff02edd9
Author: Dewey Dunnington <[email protected]>
AuthorDate: Tue Jan 9 15:37:49 2024 +0000

    feat: Add dictionary support in integration test utility (#342)
    
    This PR implements dictionary support in the integration test utility
    and fixes a few problems identified with integration testing to ensure
    that it actually works end-to-end (via
    https://github.com/apache/arrow/pull/39302 ). The changes are:
    
    - Batches that contain dictionaries can now be read, written, and
    validated using integration testing JSON
    - Fixed an issue in the integration test library (anything other than
    the first batch previously segfaulted)
    - Improved const correctness of nanoarrow.hpp (because dictionaries
    required a `std::unordered_map<>` with a `UniqueSchema` and a few const
    overloads were missing)
    - Fixed the nullability of the top-level batch to match Arrow C++ output
    - Fixed the null count of exported arrays (previously they were all
    exported as having zero nulls)
    
    It can now be tested with `archery` (after checking out
    https://github.com/apache/arrow/pull/39302 ):
    
    ```
    export ARROW_CPP_EXE_PATH=/Users/deweydunnington/.r-arrow-dev-build/build/debug
    export ARROW_NANOARROW_PATH=/path/to/arrow-nanoarrow/build
    archery integration --with-cpp=true --with-nanoarrow=true --run-c-data
    ```
    
    The current failures are limited to the remaining unimplemented types
    (datetime types and decimal).
    
    For future me, or anybody else who needs to launch a debugger on a
    segfaulting integration test in VSCode, the following launch.json
    configuration can be used:
    
    ```
    {
        "type": "lldb",
        "request": "launch",
        "name": "Debug Integration Tests",
        "program": "${workspaceFolder}/.venv/bin/python",
        "args": ["-m", "archery.cli", "integration", "--with-cpp=true",  
"--with-nanoarrow=true", "--run-c-data"],
        "cwd": "${workspaceFolder}",
        "env": {
            "ARROW_CPP_EXE_PATH": 
"/Users/deweydunnington/.r-arrow-dev-build/build/debug",
            "ARROW_NANOARROW_PATH": "${workspaceFolder}/out/build/user-local"
        }
    }
    ```
---
 .../src/nanoarrow/nanoarrow_ipc_decoder.c          |   3 +
 src/nanoarrow/array.c                              |   4 +-
 src/nanoarrow/array_test.cc                        |   3 +-
 src/nanoarrow/integration/c_data_integration.cc    |   4 +-
 src/nanoarrow/nanoarrow.hpp                        |  10 +-
 src/nanoarrow/nanoarrow_testing.hpp                | 467 ++++++++++++++++++---
 src/nanoarrow/nanoarrow_testing_test.cc            | 142 +++++++
 7 files changed, 573 insertions(+), 60 deletions(-)

diff --git a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c 
b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c
index 46b6807..8a5a111 100644
--- a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c
+++ b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c
@@ -1123,6 +1123,9 @@ ArrowErrorCode ArrowIpcDecoderDecodeSchema(struct 
ArrowIpcDecoder* decoder,
     return result;
   }
 
+  // Top-level batch schema is typically non-nullable
+  tmp.flags = 0;
+
   result = ArrowIpcDecoderSetChildren(&tmp, fields, error);
   if (result != NANOARROW_OK) {
     ArrowSchemaRelease(&tmp);
diff --git a/src/nanoarrow/array.c b/src/nanoarrow/array.c
index 3f24ccb..e790740 100644
--- a/src/nanoarrow/array.c
+++ b/src/nanoarrow/array.c
@@ -1164,8 +1164,8 @@ static int ArrowArrayViewValidateFull(struct 
ArrowArrayView* array_view,
 
   // Dictionary valiation not implemented
   if (array_view->dictionary != NULL) {
-    ArrowErrorSet(error, "Validation for dictionary-encoded arrays is not 
implemented");
-    return ENOTSUP;
+    NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->dictionary, 
error));
+    // TODO: validate the indices
   }
 
   return NANOARROW_OK;
diff --git a/src/nanoarrow/array_test.cc b/src/nanoarrow/array_test.cc
index 0101608..739e1d8 100644
--- a/src/nanoarrow/array_test.cc
+++ b/src/nanoarrow/array_test.cc
@@ -2121,9 +2121,8 @@ TEST(ArrayTest, ArrayViewTestDictionary) {
   EXPECT_EQ(array_view.buffer_views[1].size_bytes, 2 * sizeof(int32_t));
   EXPECT_EQ(array_view.dictionary->buffer_views[2].size_bytes, 6);
 
-  // Full validation not yet supported for dictionary
   EXPECT_EQ(ArrowArrayViewValidate(&array_view, 
NANOARROW_VALIDATION_LEVEL_FULL, nullptr),
-            ENOTSUP);
+            NANOARROW_OK);
 
   EXPECT_EQ(ArrowArrayViewGetIntUnsafe(&array_view, 0), 0);
   EXPECT_EQ(ArrowArrayViewGetIntUnsafe(&array_view, 1), 1);
diff --git a/src/nanoarrow/integration/c_data_integration.cc 
b/src/nanoarrow/integration/c_data_integration.cc
index 14a92b3..6ab09ea 100644
--- a/src/nanoarrow/integration/c_data_integration.cc
+++ b/src/nanoarrow/integration/c_data_integration.cc
@@ -159,7 +159,7 @@ static ArrowErrorCode ExportBatchFromJson(const char* 
json_path, int num_batch,
   MaterializedArrayStream data;
   NANOARROW_RETURN_NOT_OK(MaterializeJsonFilePath(json_path, &data, num_batch, 
error));
 
-  ArrowArrayMove(data.arrays[num_batch].get(), out);
+  ArrowArrayMove(data.arrays[0].get(), out);
   return NANOARROW_OK;
 }
 
@@ -173,7 +173,7 @@ static ArrowErrorCode ImportBatchAndCompareToJson(const 
char* json_path, int num
   nanoarrow::testing::TestingJSONComparison comparison;
   NANOARROW_RETURN_NOT_OK(comparison.SetSchema(data.schema.get(), error));
   NANOARROW_RETURN_NOT_OK(
-      comparison.CompareBatch(actual.get(), data.arrays[num_batch].get(), 
error));
+      comparison.CompareBatch(actual.get(), data.arrays[0].get(), error));
   if (comparison.num_differences() > 0) {
     std::stringstream ss;
     comparison.WriteDifferences(ss);
diff --git a/src/nanoarrow/nanoarrow.hpp b/src/nanoarrow/nanoarrow.hpp
index 15914ce..8d5b841 100644
--- a/src/nanoarrow/nanoarrow.hpp
+++ b/src/nanoarrow/nanoarrow.hpp
@@ -206,15 +206,21 @@ class Unique {
 
   /// \brief Move and take ownership of data wrapped by rhs
   Unique(Unique&& rhs) : Unique(rhs.get()) {}
+  Unique& operator=(Unique&& rhs) {
+    reset(rhs.get());
+    return *this;
+  }
 
   // These objects are not copyable
-  Unique(Unique& rhs) = delete;
+  Unique(const Unique& rhs) = delete;
 
   /// \brief Get a pointer to the data owned by this object
   T* get() noexcept { return &data_; }
+  const T* get() const noexcept { return &data_; }
 
   /// \brief Use the pointer operator to access fields of this object
-  T* operator->() { return &data_; }
+  T* operator->() noexcept { return &data_; }
+  const T* operator->() const noexcept { return &data_; }
 
   /// \brief Call data's release callback if valid
   void reset() { release_pointer(&data_); }
diff --git a/src/nanoarrow/nanoarrow_testing.hpp 
b/src/nanoarrow/nanoarrow_testing.hpp
index b62c3ca..951c215 100644
--- a/src/nanoarrow/nanoarrow_testing.hpp
+++ b/src/nanoarrow/nanoarrow_testing.hpp
@@ -15,10 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <algorithm>
 #include <iostream>
 #include <limits>
 #include <sstream>
 #include <string>
+#include <unordered_map>
 
 #include <nlohmann/json.hpp>
 
@@ -35,6 +37,96 @@ namespace nanoarrow {
 
 namespace testing {
 
+namespace internal {
+
+// Internal representation of the various structures needed to import and/or 
export
+// a dictionary array. We use a serialized version of the dictionary value 
because
+// nanoarrow doesn't currently have the ability to copy or reference count an 
Array.
+struct Dictionary {
+  nanoarrow::UniqueSchema schema;
+  int64_t column_length;
+  std::string column_json;
+};
+
+class DictionaryContext {
+ public:
+  DictionaryContext() : next_id_(0) {}
+
+  ArrowErrorCode RecordSchema(int32_t dictionary_id, const ArrowSchema* 
values_schema) {
+    if (!HasDictionaryForId(dictionary_id)) {
+      dictionaries_[dictionary_id] = internal::Dictionary();
+      NANOARROW_RETURN_NOT_OK(
+          ArrowSchemaDeepCopy(values_schema, 
dictionaries_[dictionary_id].schema.get()));
+    }
+
+    dictionary_ids_[values_schema] = dictionary_id;
+    return NANOARROW_OK;
+  }
+
+  ArrowErrorCode RecordSchema(const ArrowSchema* values_schema, int32_t* 
dictionary_id) {
+    while (HasDictionaryForId(next_id_)) {
+      next_id_++;
+    }
+
+    NANOARROW_RETURN_NOT_OK(RecordSchema(next_id_, values_schema));
+    *dictionary_id = next_id_++;
+    return NANOARROW_OK;
+  }
+
+  void RecordArray(int32_t dictionary_id, int64_t length, std::string 
column_json) {
+    dictionaries_[dictionary_id].column_length = length;
+    dictionaries_[dictionary_id].column_json = std::move(column_json);
+  }
+
+  void RecordArray(const ArrowSchema* values_schema, int64_t length,
+                   std::string column_json) {
+    auto ids_it = dictionary_ids_.find(values_schema);
+    RecordArray(ids_it->second, length, column_json);
+  }
+
+  bool empty() { return dictionaries_.empty(); }
+
+  void clear() {
+    dictionaries_.clear();
+    dictionary_ids_.clear();
+    next_id_ = 0;
+  }
+
+  bool HasDictionaryForSchema(const ArrowSchema* values_schema) const {
+    return dictionary_ids_.find(values_schema) != dictionary_ids_.end();
+  }
+
+  bool HasDictionaryForId(int32_t dictionary_id) const {
+    return dictionaries_.find(dictionary_id) != dictionaries_.end();
+  }
+
+  const Dictionary& Get(int32_t dictionary_id) const {
+    auto dict_it = dictionaries_.find(dictionary_id);
+    return dict_it->second;
+  }
+
+  const Dictionary& Get(const ArrowSchema* values_schema) const {
+    auto ids_it = dictionary_ids_.find(values_schema);
+    return Get(ids_it->second);
+  }
+
+  const std::vector<int32_t> GetAllIds() const {
+    std::vector<int32_t> out;
+    out.reserve(dictionaries_.size());
+    for (const auto& value : dictionaries_) {
+      out.push_back(value.first);
+    }
+    return out;
+  }
+
+ private:
+  int32_t next_id_;
+  std::unordered_map<int32_t, Dictionary> dictionaries_;
+  std::unordered_map<const ArrowSchema*, int32_t> dictionary_ids_;
+};
+
+}  // namespace internal
+
 /// \defgroup nanoarrow_testing-json Integration test helpers
 ///
 /// See testing format documentation for details of the JSON representation. 
This
@@ -56,6 +148,8 @@ class TestingJSONWriter {
   /// avoid serialization issues.
   void set_float_precision(int precision) { float_precision_ = precision; }
 
+  void ResetDictionaries() { dictionaries_.clear(); }
+
   /// \brief Write an ArrowArrayStream as a data file JSON object to out
   ///
   /// Creates output like `{"schema": {...}, "batches": [...], ...}`.
@@ -64,6 +158,8 @@ class TestingJSONWriter {
       return EINVAL;
     }
 
+    ResetDictionaries();
+
     out << R"({"schema": )";
 
     nanoarrow::UniqueSchema schema;
@@ -93,7 +189,14 @@ class TestingJSONWriter {
       array.reset();
     } while (true);
 
-    out << "]}";
+    out << "]";
+
+    if (!dictionaries_.empty()) {
+      out << R"(, "dictionaries": )";
+      NANOARROW_RETURN_NOT_OK(WriteDictionaryBatches(out));
+    }
+
+    out << "}";
 
     return NANOARROW_OK;
   }
@@ -138,7 +241,7 @@ class TestingJSONWriter {
   /// Creates output like `{"name" : "col", "type": {...}, ...}`
   ArrowErrorCode WriteField(std::ostream& out, const ArrowSchema* field) {
     ArrowSchemaView view;
-    NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, (ArrowSchema*)field, 
nullptr));
+    NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, field, nullptr));
 
     out << "{";
 
@@ -157,25 +260,37 @@ class TestingJSONWriter {
       out << R"(, "nullable": false)";
     }
 
-    // Write type
-    out << R"(, "type": )";
-    NANOARROW_RETURN_NOT_OK(WriteType(out, &view));
+    // For dictionary encoding, write type as the dictionary (values) type,
+    // record the dictionary schema, and write the "dictionary" member
+    if (field->dictionary != nullptr) {
+      ArrowSchemaView dictionary_view;
+      NANOARROW_RETURN_NOT_OK(
+          ArrowSchemaViewInit(&dictionary_view, field->dictionary, nullptr));
 
-    // Write children
-    out << R"(, "children": )";
-    if (field->n_children == 0) {
-      out << "[]";
+      out << R"(, "type": )";
+      NANOARROW_RETURN_NOT_OK(WriteType(out, &dictionary_view));
+
+      int32_t dictionary_id;
+      NANOARROW_RETURN_NOT_OK(
+          dictionaries_.RecordSchema(field->dictionary, &dictionary_id));
+
+      out << R"(, "dictionary": )";
+      view.type = view.storage_type;
+      NANOARROW_RETURN_NOT_OK(WriteFieldDictionary(
+          out, dictionary_id, field->flags & ARROW_FLAG_DICTIONARY_ORDERED, 
&view));
+
+      // Write dictionary children
+      out << R"(, "children": )";
+      NANOARROW_RETURN_NOT_OK(WriteFieldChildren(out, field->dictionary));
     } else {
-      out << "[";
-      NANOARROW_RETURN_NOT_OK(WriteField(out, field->children[0]));
-      for (int64_t i = 1; i < field->n_children; i++) {
-        out << ", ";
-        NANOARROW_RETURN_NOT_OK(WriteField(out, field->children[i]));
-      }
-      out << "]";
-    }
+      // Write non-dictionary type/children
+      out << R"(, "type": )";
+      NANOARROW_RETURN_NOT_OK(WriteType(out, &view));
 
-    // TODO: Dictionary (currently fails at WriteType)
+      // Write children
+      out << R"(, "children": )";
+      NANOARROW_RETURN_NOT_OK(WriteFieldChildren(out, field));
+    }
 
     // Write metadata
     if (field->metadata != nullptr) {
@@ -341,20 +456,70 @@ class TestingJSONWriter {
     }
 
     out << "}";
+
+    // Write the dictionary values to the DictionaryContext for later if 
applicable
+    if (field->dictionary != nullptr) {
+      if (!dictionaries_.HasDictionaryForSchema(field->dictionary)) {
+        return EINVAL;
+      }
+
+      std::stringstream dictionary_output;
+      NANOARROW_RETURN_NOT_OK(
+          WriteColumn(dictionary_output, field->dictionary, 
value->dictionary));
+      dictionaries_.RecordArray(field->dictionary, value->dictionary->length,
+                                std::move(dictionary_output.str()));
+    }
+
+    return NANOARROW_OK;
+  }
+
+  ArrowErrorCode WriteDictionaryBatches(std::ostream& out) {
+    std::vector<int32_t> ids = dictionaries_.GetAllIds();
+    if (ids.empty()) {
+      out << "[]";
+      return NANOARROW_OK;
+    }
+
+    out << "[";
+    std::sort(ids.begin(), ids.end());
+    NANOARROW_RETURN_NOT_OK(WriteDictionaryBatch(out, ids[0]));
+    for (size_t i = 1; i < ids.size(); i++) {
+      out << ", ";
+      NANOARROW_RETURN_NOT_OK(WriteDictionaryBatch(out, ids[i]));
+    }
+    out << "]";
+
     return NANOARROW_OK;
   }
 
  private:
   int float_precision_;
+  internal::DictionaryContext dictionaries_;
 
-  ArrowErrorCode WriteType(std::ostream& out, const ArrowSchemaView* field) {
-    ArrowType type;
-    if (field->extension_name.data != nullptr) {
-      type = field->storage_type;
+  ArrowErrorCode WriteDictionaryBatch(std::ostream& out, int32_t 
dictionary_id) {
+    const internal::Dictionary& dict = dictionaries_.Get(dictionary_id);
+    out << R"({"id": )" << dictionary_id << R"(, "data": {"count": )"
+        << dict.column_length << R"(, "columns": [)" << dict.column_json << 
"]}}";
+    return NANOARROW_OK;
+  }
+
+  ArrowErrorCode WriteFieldChildren(std::ostream& out, const ArrowSchema* 
field) {
+    if (field->n_children == 0) {
+      out << "[]";
     } else {
-      type = field->type;
+      out << "[";
+      NANOARROW_RETURN_NOT_OK(WriteField(out, field->children[0]));
+      for (int64_t i = 1; i < field->n_children; i++) {
+        out << ", ";
+        NANOARROW_RETURN_NOT_OK(WriteField(out, field->children[i]));
+      }
+      out << "]";
     }
 
+    return NANOARROW_OK;
+  }
+
+  ArrowErrorCode WriteType(std::ostream& out, const ArrowSchemaView* field) {
     out << "{";
 
     switch (field->type) {
@@ -447,6 +612,26 @@ class TestingJSONWriter {
     return NANOARROW_OK;
   }
 
+  ArrowErrorCode WriteFieldDictionary(std::ostream& out, int32_t dictionary_id,
+                                      bool is_ordered,
+                                      const ArrowSchemaView* indices_field) {
+    out << "{";
+
+    out << R"("id": )" << dictionary_id;
+
+    out << R"(, "indexType": )";
+    NANOARROW_RETURN_NOT_OK(WriteType(out, indices_field));
+
+    if (is_ordered) {
+      out << R"(, "isOrdered": true)";
+    } else {
+      out << R"(, "isOrdered": false)";
+    }
+
+    out << "}";
+    return NANOARROW_OK;
+  }
+
   ArrowErrorCode WriteMetadataItem(std::ostream& out, ArrowMetadataReader* 
reader) {
     ArrowStringView key;
     ArrowStringView value;
@@ -746,6 +931,8 @@ class TestingJSONReader {
   ArrowErrorCode ReadDataFile(const std::string& data_file_json, 
ArrowArrayStream* out,
                               int num_batch = kNumBatchReadAll,
                               ArrowError* error = nullptr) {
+    dictionaries_.clear();
+
     try {
       auto obj = json::parse(data_file_json);
       NANOARROW_RETURN_NOT_OK(Check(obj.is_object(), error, "data file must be 
object"));
@@ -767,6 +954,11 @@ class TestingJSONReader {
       NANOARROW_RETURN_NOT_OK(
           ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), error));
 
+      // Record any dictionaries that might be present
+      if (obj.contains("dictionaries")) {
+        NANOARROW_RETURN_NOT_OK(RecordDictionaryBatches(obj["dictionaries"], 
error));
+      }
+
       // Get a vector of batch ids to parse
       std::vector<size_t> batch_ids;
       if (num_batch == kNumBatchOnlySchema) {
@@ -793,8 +985,8 @@ class TestingJSONReader {
         NANOARROW_RETURN_NOT_OK(
             ArrowArrayInitFromArrayView(array.get(), array_view.get(), error));
         SetArrayAllocatorRecursive(array.get());
-        NANOARROW_RETURN_NOT_OK(
-            SetArrayBatch(batches[batch_ids[i]], array_view.get(), 
array.get(), error));
+        NANOARROW_RETURN_NOT_OK(SetArrayBatch(batches[batch_ids[i]], 
schema.get(),
+                                              array_view.get(), array.get(), 
error));
         ArrowBasicArrayStreamSetArray(stream.get(), i, array.get());
       }
 
@@ -864,7 +1056,8 @@ class TestingJSONReader {
       NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromSchema(array.get(), schema, 
error));
       SetArrayAllocatorRecursive(array.get());
 
-      NANOARROW_RETURN_NOT_OK(SetArrayBatch(obj, array_view.get(), 
array.get(), error));
+      NANOARROW_RETURN_NOT_OK(
+          SetArrayBatch(obj, schema, array_view.get(), array.get(), error));
       ArrowArrayMove(array.get(), out);
       return NANOARROW_OK;
     } catch (json::exception& e) {
@@ -894,7 +1087,8 @@ class TestingJSONReader {
       SetArrayAllocatorRecursive(array.get());
 
       // Parse the JSON into the array
-      NANOARROW_RETURN_NOT_OK(SetArrayColumn(obj, array_view.get(), 
array.get(), error));
+      NANOARROW_RETURN_NOT_OK(
+          SetArrayColumn(obj, schema, array_view.get(), array.get(), error));
 
       // Return the result
       ArrowArrayMove(array.get(), out);
@@ -907,6 +1101,7 @@ class TestingJSONReader {
 
  private:
   ArrowBufferAllocator allocator_;
+  internal::DictionaryContext dictionaries_;
 
   ArrowErrorCode SetSchema(ArrowSchema* schema, const json& value, ArrowError* 
error) {
     NANOARROW_RETURN_NOT_OK(
@@ -917,6 +1112,9 @@ class TestingJSONReader {
     NANOARROW_RETURN_NOT_OK_WITH_ERROR(
         ArrowSchemaInitFromType(schema, NANOARROW_TYPE_STRUCT), error);
 
+    // Top-level schema is non-nullable
+    schema->flags = 0;
+
     const auto& fields = value["fields"];
     NANOARROW_RETURN_NOT_OK(
         Check(fields.is_array(), error, "Schema fields must be array"));
@@ -939,17 +1137,18 @@ class TestingJSONReader {
   ArrowErrorCode SetField(ArrowSchema* schema, const json& value, ArrowError* 
error) {
     NANOARROW_RETURN_NOT_OK(
         Check(value.is_object(), error, "Expected Field to be a JSON object"));
+    ArrowSchemaInit(schema);
+
     NANOARROW_RETURN_NOT_OK(
         Check(value.contains("name"), error, "Field missing key 'name'"));
-    NANOARROW_RETURN_NOT_OK(
-        Check(value.contains("nullable"), error, "Field missing key 
'nullable'"));
     NANOARROW_RETURN_NOT_OK(
         Check(value.contains("type"), error, "Field missing key 'type'"));
+    NANOARROW_RETURN_NOT_OK(
+        Check(value.contains("nullable"), error, "Field missing key 
'nullable'"));
     NANOARROW_RETURN_NOT_OK(
         Check(value.contains("children"), error, "Field missing key 
'children'"));
 
-    ArrowSchemaInit(schema);
-
+    // Name
     const auto& name = value["name"];
     NANOARROW_RETURN_NOT_OK(Check(name.is_string() || name.is_null(), error,
                                   "Field name must be string or null"));
@@ -959,6 +1158,7 @@ class TestingJSONReader {
                                          error);
     }
 
+    // Nullability
     const auto& nullable = value["nullable"];
     NANOARROW_RETURN_NOT_OK(
         Check(nullable.is_boolean(), error, "Field nullable must be boolean"));
@@ -968,6 +1168,38 @@ class TestingJSONReader {
       schema->flags &= ~ARROW_FLAG_NULLABLE;
     }
 
+    // Metadata
+    if (value.contains("metadata")) {
+      NANOARROW_RETURN_NOT_OK(SetMetadata(schema, value["metadata"], error));
+    }
+
+    // If we have a dictionary, this value needs to be in schema->dictionary
+    // and value["dictionary"] needs to be in schema
+    if (value.contains("dictionary")) {
+      // Put the index type in this schema
+      int32_t dictionary_id;
+      NANOARROW_RETURN_NOT_OK(
+          SetDictionary(schema, value["dictionary"], &dictionary_id, error));
+
+      // Allocate a dictionary and put this value (minus dictionary, metadata, 
and name)
+      json value_copy = value;
+      value_copy.erase("dictionary");
+      value_copy.erase("metadata");
+      value_copy["name"] = nullptr;
+      
NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaAllocateDictionary(schema), 
error);
+      NANOARROW_RETURN_NOT_OK(SetField(schema->dictionary, value_copy, error));
+
+      // Keep track of this dictionary_id/schema for parsing batches
+      NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+          dictionaries_.RecordSchema(dictionary_id, schema->dictionary), 
error);
+
+      // Validate!
+      ArrowSchemaView schema_view;
+      NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, 
error));
+
+      return NANOARROW_OK;
+    }
+
     NANOARROW_RETURN_NOT_OK(SetType(schema, value["type"], error));
 
     const auto& children = value["children"];
@@ -979,16 +1211,43 @@ class TestingJSONReader {
       NANOARROW_RETURN_NOT_OK(SetField(schema->children[i], children[i], 
error));
     }
 
-    if (value.contains("metadata")) {
-      NANOARROW_RETURN_NOT_OK(SetMetadata(schema, value["metadata"], error));
-    }
-
     // Validate!
     ArrowSchemaView schema_view;
     NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, error));
     return NANOARROW_OK;
   }
 
+  ArrowErrorCode SetDictionary(ArrowSchema* schema, const json& value,
+                               int32_t* dictionary_id, ArrowError* error) {
+    NANOARROW_RETURN_NOT_OK(Check(value.is_object(), error, "Dictionary must 
be object"));
+    NANOARROW_RETURN_NOT_OK(
+        Check(value.contains("id"), error, "Dictionary missing key 'id'"));
+    NANOARROW_RETURN_NOT_OK(
+        Check(value.contains("indexType"), error, "Dictionary missing key 
'type'"));
+    NANOARROW_RETURN_NOT_OK(
+        Check(value.contains("isOrdered"), error, "Dictionary missing key 
'isOrdered'"));
+
+    const auto& id = value["id"];
+    NANOARROW_RETURN_NOT_OK(
+        Check(id.is_number_integer(), error, "Dictionary id must be integer"));
+    *dictionary_id = id.get<int32_t>();
+
+    // Parse the index type
+    NANOARROW_RETURN_NOT_OK(SetType(schema, value["indexType"], error));
+
+    // Set the flag
+    const auto& is_ordered = value["isOrdered"];
+    NANOARROW_RETURN_NOT_OK(
+        Check(is_ordered.is_boolean(), error, "Dictionary isOrdered must be 
bool"));
+    if (is_ordered.get<bool>()) {
+      schema->flags |= ARROW_FLAG_DICTIONARY_ORDERED;
+    } else {
+      schema->flags &= ~ARROW_FLAG_DICTIONARY_ORDERED;
+    }
+
+    return NANOARROW_OK;
+  }
+
   ArrowErrorCode SetType(ArrowSchema* schema, const json& value, ArrowError* 
error) {
     NANOARROW_RETURN_NOT_OK(Check(value.is_object(), error, "Type must be 
object"));
     NANOARROW_RETURN_NOT_OK(
@@ -1314,8 +1573,9 @@ class TestingJSONReader {
     return NANOARROW_OK;
   }
 
-  ArrowErrorCode SetArrayBatch(const json& value, ArrowArrayView* array_view,
-                               ArrowArray* array, ArrowError* error) {
+  ArrowErrorCode SetArrayBatch(const json& value, const ArrowSchema* schema,
+                               ArrowArrayView* array_view, ArrowArray* array,
+                               ArrowError* error) {
     NANOARROW_RETURN_NOT_OK(
         Check(value.is_object(), error, "Expected RecordBatch to be a JSON 
object"));
 
@@ -1337,8 +1597,9 @@ class TestingJSONReader {
                                   "RecordBatch children has incorrect size"));
 
     for (int64_t i = 0; i < array_view->n_children; i++) {
-      NANOARROW_RETURN_NOT_OK(
-          SetArrayColumn(columns[i], array_view->children[i], 
array->children[i], error));
+      NANOARROW_RETURN_NOT_OK(SetArrayColumn(columns[i], schema->children[i],
+                                             array_view->children[i], 
array->children[i],
+                                             error));
     }
 
     // Validate the array view
@@ -1354,8 +1615,56 @@ class TestingJSONReader {
     return NANOARROW_OK;
   }
 
-  ArrowErrorCode SetArrayColumn(const json& value, ArrowArrayView* array_view,
-                                ArrowArray* array, ArrowError* error,
+  ArrowErrorCode RecordDictionaryBatches(const json& value, ArrowError* error) 
{
+    NANOARROW_RETURN_NOT_OK(Check(value.is_array(), error, "dictionaries must 
be array"));
+
+    for (const auto& batch : value) {
+      NANOARROW_RETURN_NOT_OK(RecordDictionaryBatch(batch, error));
+    }
+
+    return NANOARROW_OK;
+  }
+
+  ArrowErrorCode RecordDictionaryBatch(const json& value, ArrowError* error) {
+    NANOARROW_RETURN_NOT_OK(
+        Check(value.is_object(), error, "dictionary batch must be object"));
+    NANOARROW_RETURN_NOT_OK(
+        Check(value.contains("id"), error, "dictionary batch missing key 
'id'"));
+    NANOARROW_RETURN_NOT_OK(
+        Check(value.contains("data"), error, "dictionary batch missing key 
'data'"));
+
+    const auto& id = value["id"];
+    NANOARROW_RETURN_NOT_OK(
+        Check(id.is_number_integer(), error, "dictionary batch id must be 
integer"));
+    int id_int = id.get<int>();
+    NANOARROW_RETURN_NOT_OK(Check(dictionaries_.HasDictionaryForId(id_int), 
error,
+                                  "dictionary batch has unknown id"));
+
+    const auto& batch = value["data"];
+    NANOARROW_RETURN_NOT_OK(
+        Check(batch.is_object(), error, "dictionary batch data must be 
object"));
+    NANOARROW_RETURN_NOT_OK(Check(batch.contains("columns"), error,
+                                  "dictionary batch missing key 'columns'"));
+    NANOARROW_RETURN_NOT_OK(
+        Check(batch.contains("count"), error, "dictionary batch missing key 
'count'"));
+
+    const auto& batch_columns = batch["columns"];
+    NANOARROW_RETURN_NOT_OK(Check(batch_columns.is_array() && 
batch_columns.size() == 1,
+                                  error,
+                                  "dictionary batch columns must be array of 
size 1"));
+
+    const auto& batch_count = batch["count"];
+    NANOARROW_RETURN_NOT_OK(Check(batch_count.is_number_integer(), error,
+                                  "dictionary batch count must be integer"));
+
+    dictionaries_.RecordArray(id_int, batch_count.get<int32_t>(),
+                              batch_columns[0].dump());
+    return NANOARROW_OK;
+  }
+
+  ArrowErrorCode SetArrayColumn(const json& value, const ArrowSchema* schema,
+                                ArrowArrayView* array_view, ArrowArray* array,
+                                ArrowError* error,
                                 const std::string& parent_error_prefix = "") {
     NANOARROW_RETURN_NOT_OK(
         Check(value.is_object(), error, "Expected Column to be a JSON 
object"));
@@ -1388,7 +1697,8 @@ class TestingJSONReader {
                                     error_prefix + "children has incorrect 
size"));
 
       for (int64_t i = 0; i < array_view->n_children; i++) {
-        NANOARROW_RETURN_NOT_OK(SetArrayColumn(children[i], 
array_view->children[i],
+        NANOARROW_RETURN_NOT_OK(SetArrayColumn(children[i], 
schema->children[i],
+                                               array_view->children[i],
                                                array->children[i], error, 
error_prefix));
       }
     }
@@ -1416,6 +1726,28 @@ class TestingJSONReader {
       ArrowBufferView* buffer_view = array_view->buffer_views + i;
       buffer_view->data.as_uint8 = buffer->data;
       buffer_view->size_bytes = buffer->size_bytes;
+
+      // If this is a validity buffer with a big enough size, set the 
array_view's
+      // null_count
+      if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY 
&&
+          _ArrowBytesForBits(array_view->length) <= buffer_view->size_bytes) {
+        array_view->null_count =
+            array_view->length -
+            ArrowBitCountSet(buffer_view->data.as_uint8, 0, 
array_view->length);
+      }
+    }
+
+    // If there is a dictionary associated with schema, parse its value into 
dictionary
+    if (schema->dictionary != nullptr) {
+      NANOARROW_RETURN_NOT_OK(Check(
+          dictionaries_.HasDictionaryForSchema(schema->dictionary), error,
+          error_prefix +
+              "dictionary could not be resolved from dictionary id in 
SetArrayColumn()"));
+
+      const internal::Dictionary& dict = dictionaries_.Get(schema->dictionary);
+      NANOARROW_RETURN_NOT_OK(SetArrayColumn(
+          json::parse(dict.column_json), schema->dictionary, 
array_view->dictionary,
+          array->dictionary, error, error_prefix + "-> <dictionary> "));
     }
 
     // Validate the array view
@@ -1424,9 +1756,10 @@ class TestingJSONReader {
         error_prefix + "failed to validate: "));
 
     // Flush length and buffer pointers to the Array
-    array->length = array_view->length;
     NANOARROW_RETURN_NOT_OK_WITH_ERROR(
         ArrowArrayFinishBuilding(array, NANOARROW_VALIDATION_LEVEL_NONE, 
nullptr), error);
+    array->length = array_view->length;
+    array->null_count = array_view->null_count;
 
     return NANOARROW_OK;
   }
@@ -1885,6 +2218,9 @@ class TestingJSONComparison {
   ArrowErrorCode CompareSchema(const ArrowSchema* actual, const ArrowSchema* 
expected,
                                ArrowError* error = nullptr,
                                const std::string& path = "") {
+    writer_actual_.ResetDictionaries();
+    writer_expected_.ResetDictionaries();
+
     // Compare the top-level schema "manually" because (1) map type needs 
special-cased
     // comparison and (2) it's easier to read the output if differences are 
separated
     // by field.
@@ -1926,13 +2262,13 @@ class TestingJSONComparison {
 
     // Compare metadata
     std::stringstream ss;
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteMetadata(ss, 
actual->metadata),
+    NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteMetadata(ss, 
actual->metadata),
                                        error);
     std::string actual_metadata = ss.str();
 
     ss.str("");
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteMetadata(ss, 
expected->metadata),
-                                       error);
+    NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+        writer_expected_.WriteMetadata(ss, expected->metadata), error);
     std::string expected_metadata = ss.str();
 
     if (actual_metadata != expected_metadata) {
@@ -1960,6 +2296,14 @@ class TestingJSONComparison {
       return EINVAL;
     }
 
+    // "Write" the schema using both writers to ensure dictionary ids can be 
resolved
+    // using the ArrowSchema* pointers from schema_
+    std::stringstream ss;
+    writer_actual_.ResetDictionaries();
+    writer_expected_.ResetDictionaries();
+    writer_actual_.WriteSchema(ss, schema_.get());
+    writer_expected_.WriteSchema(ss, schema_.get());
+
     return NANOARROW_OK;
   }
 
@@ -1993,7 +2337,8 @@ class TestingJSONComparison {
   }
 
  private:
-  TestingJSONWriter writer_;
+  TestingJSONWriter writer_actual_;
+  TestingJSONWriter writer_expected_;
   std::vector<Difference> differences_;
   nanoarrow::UniqueSchema schema_;
   nanoarrow::UniqueArrayView actual_;
@@ -2019,11 +2364,11 @@ class TestingJSONComparison {
                                   ArrowError* error, const std::string& path = 
"") {
     std::stringstream ss;
 
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteField(ss, expected), 
error);
+    NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_expected_.WriteField(ss, 
expected), error);
     std::string expected_json = ss.str();
 
     ss.str("");
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteField(ss, actual), error);
+    NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteField(ss, actual), 
error);
     std::string actual_json = ss.str();
 
     if (actual_json != expected_json) {
@@ -2036,13 +2381,31 @@ class TestingJSONComparison {
   ArrowErrorCode CompareColumn(ArrowSchema* schema, ArrowArrayView* actual,
                                ArrowArrayView* expected, ArrowError* error,
                                const std::string& path = "") {
-    std::stringstream ss;
+    // Compare children and dictionaries first, then higher-level structures 
after.
+    // This is redundant because the higher-level serialized JSON will also 
report
+    // a difference if deeply nested children have differences; however, it 
will not
+    // contain dictionaries and this output is slightly better (more targeted 
differences
+    // that are slightly easier to read appear first).
+    for (int64_t i = 0; i < schema->n_children; i++) {
+      NANOARROW_RETURN_NOT_OK(
+          CompareColumn(schema->children[i], actual->children[i], 
expected->children[i],
+                        error, path + ".children[" + std::to_string(i) + "]"));
+    }
 
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteColumn(ss, schema, 
expected), error);
+    if (schema->dictionary != nullptr) {
+      NANOARROW_RETURN_NOT_OK(CompareColumn(schema->dictionary, 
actual->dictionary,
+                                            expected->dictionary, error,
+                                            path + ".dictionary"));
+    }
+
+    std::stringstream ss;
+    NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_expected_.WriteColumn(ss, 
schema, expected),
+                                       error);
     std::string expected_json = ss.str();
 
     ss.str("");
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteColumn(ss, schema, 
actual), error);
+    NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteColumn(ss, schema, 
actual),
+                                       error);
     std::string actual_json = ss.str();
 
     if (actual_json != expected_json) {
diff --git a/src/nanoarrow/nanoarrow_testing_test.cc 
b/src/nanoarrow/nanoarrow_testing_test.cc
index 217bf9d..248ec69 100644
--- a/src/nanoarrow/nanoarrow_testing_test.cc
+++ b/src/nanoarrow/nanoarrow_testing_test.cc
@@ -406,6 +406,21 @@ TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldBasic) 
{
       R"({"name": "colname", "nullable": true, "type": {"name": "null"}, 
"children": []})");
 }
 
+TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldDict) {
+  TestWriteJSON(
+      [](ArrowSchema* schema) {
+        NANOARROW_RETURN_NOT_OK(ArrowSchemaInitFromType(schema, 
NANOARROW_TYPE_INT16));
+        NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateDictionary(schema));
+        NANOARROW_RETURN_NOT_OK(
+            ArrowSchemaInitFromType(schema->dictionary, 
NANOARROW_TYPE_STRING));
+        return NANOARROW_OK;
+      },
+      [](ArrowArray* array) { return NANOARROW_OK; }, &WriteFieldJSON,
+      R"({"name": null, "nullable": true, "type": {"name": "utf8"}, )"
+      R"("dictionary": {"id": 0, "indexType": {"name": "int", "bitWidth": 16, 
"isSigned": true}, )"
+      R"("isOrdered": false}, "children": []})");
+}
+
 TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldMetadata) {
   // Missing metadata
   TestWriteJSON(
@@ -805,6 +820,37 @@ TEST(NanoarrowTestingTest, 
NanoarrowTestingTestReadFieldNested) {
   EXPECT_STREQ(schema->children[0]->format, "n");
 }
 
+TEST(NanoarrowTestingTest, NanoarrowTestingTestReadFieldDictionary) {
+  nanoarrow::UniqueSchema schema;
+  TestingJSONReader reader;
+
+  // Unordered
+  ASSERT_EQ(
+      reader.ReadField(
+          R"({"name": "col1", "nullable": true, "type": {"name": "utf8"}, 
"children": [], )"
+          R"("dictionary": {"id": 0, "indexType": {"name": "int", "bitWidth": 
32, "isSigned": true}, "isOrdered": false}})",
+          schema.get()),
+      NANOARROW_OK);
+  EXPECT_STREQ(schema->format, "i");
+  EXPECT_STREQ(schema->name, "col1");
+  EXPECT_TRUE(schema->flags & ARROW_FLAG_NULLABLE);
+  EXPECT_FALSE(schema->flags & ARROW_FLAG_DICTIONARY_ORDERED);
+  ASSERT_NE(schema->dictionary, nullptr);
+  EXPECT_STREQ(schema->dictionary->format, "u");
+  EXPECT_EQ(schema->dictionary->name, nullptr);
+  EXPECT_EQ(schema->dictionary->dictionary, nullptr);
+
+  // Ordered
+  schema.reset();
+  ASSERT_EQ(
+      reader.ReadField(
+          R"({"name": "col1", "nullable": true, "type": {"name": "utf8"}, 
"children": [], )"
+          R"("dictionary": {"id": 0, "indexType": {"name": "int", "bitWidth": 
32, "isSigned": true}, "isOrdered": true}})",
+          schema.get()),
+      NANOARROW_OK);
+  EXPECT_TRUE(schema->flags & ARROW_FLAG_DICTIONARY_ORDERED);
+}
+
 TEST(NanoarrowTestingTest, NanoarrowTestingTestRoundtripDataFile) {
   nanoarrow::UniqueArrayStream stream;
   ArrowError error;
@@ -850,6 +896,33 @@ TEST(NanoarrowTestingTest, 
NanoarrowTestingTestRoundtripDataFile) {
   ASSERT_EQ(reader.ReadDataFile("{", stream.get()), EINVAL);
 }
 
+TEST(NanoarrowTestingTest, NanoarrowTestingTestRoundtripDataFileDictionary) {
+  nanoarrow::UniqueArrayStream stream;
+  ArrowError error;
+  error.message[0] = '\0';
+
+  std::string data_file_json =
+      R"({"schema": {"fields": [{"name": null, "nullable": true, "type": 
{"name": "binary"}, )"
+      R"("dictionary": {"id": 0, "indexType": {"name": "int", "bitWidth": 32, 
"isSigned": true}, "isOrdered": false}, "children": []}, )"
+      R"({"name": null, "nullable": true, "type": {"name": "utf8"}, )"
+      R"("dictionary": {"id": 1, "indexType": {"name": "int", "bitWidth": 8, 
"isSigned": true}, "isOrdered": false}, "children": []}]}, )"
+      R"("batches": [{"count": 1, "columns": [{"name": null, "count": 1, 
"VALIDITY": [1], "DATA": [0]}, )"
+      R"({"name": null, "count": 1, "VALIDITY": [1], "DATA": [1]}]}], )"
+      R"("dictionaries": [{"id": 0, "data": {"count": 1, "columns": [{"name": 
null, "count": 1, "VALIDITY": [1], "OFFSET": [0, 3], "DATA": ["616263"]}]}}, )"
+      R"({"id": 1, "data": {"count": 2, "columns": [{"name": null, "count": 2, 
"VALIDITY": [1, 1], "OFFSET": [0, 3, 6], "DATA": ["abc", "def"]}]}}]})";
+
+  TestingJSONReader reader;
+  ASSERT_EQ(reader.ReadDataFile(data_file_json, stream.get(),
+                                TestingJSONReader::kNumBatchReadAll, &error),
+            NANOARROW_OK)
+      << error.message;
+
+  TestingJSONWriter writer;
+  std::stringstream data_file_json_roundtrip;
+  ASSERT_EQ(writer.WriteDataFile(data_file_json_roundtrip, stream.get()), 
NANOARROW_OK);
+  EXPECT_EQ(data_file_json_roundtrip.str(), data_file_json);
+}
+
 TEST(NanoarrowTestingTest, NanoarrowTestingTestReadBatch) {
   nanoarrow::UniqueSchema schema;
   nanoarrow::UniqueArray array;
@@ -1178,6 +1251,20 @@ TEST(NanoarrowTestingTest, 
NanoarrowTestingTestFieldUnion) {
                 "Type[name=='union'] mode must be 'DENSE' or 'SPARSE'");
 }
 
+TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldDictionaryRoundtrip) {
+  // Unordered
+  TestFieldRoundtrip(
+      R"({"name": null, "nullable": true, "type": {"name": "utf8"}, )"
+      R"("dictionary": {"id": 0, "indexType": {"name": "int", "bitWidth": 16, 
"isSigned": true}, )"
+      R"("isOrdered": false}, "children": []})");
+
+  // Ordered
+  TestFieldRoundtrip(
+      R"({"name": null, "nullable": true, "type": {"name": "utf8"}, )"
+      R"("dictionary": {"id": 0, "indexType": {"name": "int", "bitWidth": 16, 
"isSigned": true}, )"
+      R"("isOrdered": true}, "children": []})");
+}
+
 void AssertSchemasCompareEqual(ArrowSchema* actual, ArrowSchema* expected) {
   TestingJSONComparison comparison;
   std::stringstream msg;
@@ -1346,6 +1433,61 @@ TEST(NanoarrowTestingTest, 
NanoarrowTestingTestArrayComparison) {
 )");
 }
 
+TEST(NanoarrowTestingTest, NanoarrowTestingTestArrayWithDictionaryComparison) {
+  nanoarrow::UniqueSchema schema;
+  nanoarrow::UniqueArray actual;
+  nanoarrow::UniqueArray expected;
+
+  TestingJSONComparison comparison;
+  std::stringstream msg;
+
+  ArrowSchemaInit(schema.get());
+  ASSERT_EQ(ArrowSchemaSetTypeStruct(schema.get(), 1), NANOARROW_OK);
+  ASSERT_EQ(ArrowSchemaSetType(schema->children[0], NANOARROW_TYPE_INT32), 
NANOARROW_OK);
+  ASSERT_EQ(ArrowSchemaAllocateDictionary(schema->children[0]), NANOARROW_OK);
+  ASSERT_EQ(
+      ArrowSchemaInitFromType(schema->children[0]->dictionary, 
NANOARROW_TYPE_STRING),
+      NANOARROW_OK);
+  ASSERT_EQ(comparison.SetSchema(schema.get()), NANOARROW_OK);
+
+  // Dictionary-encoded with one element
+  ASSERT_EQ(ArrowArrayInitFromSchema(expected.get(), schema.get(), nullptr),
+            NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayStartAppending(expected.get()), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendInt(expected->children[0], 0), NANOARROW_OK);
+  ASSERT_EQ(
+      ArrowArrayAppendString(expected->children[0]->dictionary, 
ArrowCharView("abc")),
+      NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishElement(expected.get()), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishBuildingDefault(expected.get(), nullptr), 
NANOARROW_OK);
+
+  // Dictionary-encoded with one element with the only difference in the 
dictionary
+  ASSERT_EQ(ArrowArrayInitFromSchema(actual.get(), schema.get(), nullptr), 
NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayStartAppending(actual.get()), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendInt(actual->children[0], 0), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendString(actual->children[0]->dictionary, 
ArrowCharView("def")),
+            NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishElement(actual.get()), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishBuildingDefault(actual.get(), nullptr), 
NANOARROW_OK);
+
+  // Compare array with dictionary that has no differences
+  ASSERT_EQ(comparison.CompareBatch(actual.get(), actual.get()), NANOARROW_OK);
+  EXPECT_EQ(comparison.num_differences(), 0);
+  comparison.ClearDifferences();
+
+  // Compare arrays with nested difference in the dictionary
+  ArrowError error;
+  ASSERT_EQ(comparison.CompareBatch(actual.get(), expected.get(), &error), 
NANOARROW_OK)
+      << error.message;
+  EXPECT_EQ(comparison.num_differences(), 1);
+  comparison.WriteDifferences(msg);
+  EXPECT_EQ(msg.str(), R"(Path: .children[0].dictionary
+- {"name": null, "count": 1, "VALIDITY": [1], "OFFSET": [0, 3], "DATA": 
["def"]}
++ {"name": null, "count": 1, "VALIDITY": [1], "OFFSET": [0, 3], "DATA": 
["abc"]}
+
+)");
+}
+
 ArrowErrorCode MakeArrayStream(const ArrowSchema* schema,
                                std::vector<std::string> batches_json,
                                ArrowArrayStream* out) {

(arrow-nanoarrow) branch main updated: feat: Add dictionary support in integration test utility (#342)

Reply via email to