pitrou commented on a change in pull request #7992:
URL: https://github.com/apache/arrow/pull/7992#discussion_r477341153



##########
File path: cpp/src/arrow/ipc/dictionary.h
##########
@@ -21,96 +21,145 @@
 
 #include <cstdint>
 #include <memory>
-#include <unordered_map>
 #include <utility>
 #include <vector>
 
-#include "arrow/memory_pool.h"
+#include "arrow/result.h"
 #include "arrow/status.h"
+#include "arrow/type_fwd.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
+namespace ipc {
+
+class FieldPosition {
+ public:
+  FieldPosition() : parent_(NULLPTR), index_(-1), depth_(0) {}
+
+  FieldPosition child(int index) const { return {this, index}; }
+
+  std::vector<int> path() const {
+    std::vector<int> path(depth_);
+    const FieldPosition* cur = this;
+    for (int i = depth_ - 1; i >= 0; --i) {
+      path[i] = cur->index_;
+      cur = cur->parent_;
+    }
+    return path;
+  }
+
+ protected:
+  FieldPosition(const FieldPosition* parent, int index)
+      : parent_(parent), index_(index), depth_(parent->depth_ + 1) {}
+
+  const FieldPosition* parent_;
+  int index_;
+  int depth_;
+};
 
-class Array;
-class DataType;
-class Field;
-class RecordBatch;
+/// \brief Map fields in a schema to dictionary ids
+///
+/// The mapping is structural, i.e. the field path (as a vector of indices)
+/// is associated to the dictionary id.
+class ARROW_EXPORT DictionaryFieldMapper {
+ public:
+  DictionaryFieldMapper();
+  explicit DictionaryFieldMapper(const Schema& schema);
+  ~DictionaryFieldMapper();
 
-namespace ipc {
+  Status AddSchemaFields(const Schema& schema);
+  Status AddField(int64_t id, std::vector<int> field_path);
 
-/// \brief Memoization data structure for assigning id numbers to
-/// dictionaries and tracking their current state through possible
-/// deltas in an IPC stream
+  Result<int64_t> GetFieldId(std::vector<int> field_path) const;
+
+  int num_fields() const;
+
+ private:
+  struct Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+using DictionaryVector = std::vector<std::pair<int64_t, std::shared_ptr<Array>>>;
+
+/// \brief Memoization data structure for reading dictionaries from IPC streams
+///
+/// This structure tracks the following associations:
+/// - field position (structural) -> dictionary id
+/// - dictionary id -> value type
+/// - dictionary id -> dictionary (value) data
+///
+/// Together, they allow resolving dictionary data when reading an IPC stream,
+/// using metadata recorded in the schema message and data recorded in the
+/// dictionary batch messages (see ResolveDictionaries).
+///
+/// This structure isn't useful for writing an IPC stream, where only
+/// DictionaryFieldMapper is necessary.
 class ARROW_EXPORT DictionaryMemo {
  public:
-  using DictionaryVector = std::vector<std::pair<int64_t, std::shared_ptr<Array>>>;
-
   DictionaryMemo();
-  DictionaryMemo(DictionaryMemo&&) = default;
-  DictionaryMemo& operator=(DictionaryMemo&&) = default;
+  ~DictionaryMemo();
+
+  DictionaryFieldMapper& fields();

Review comment:
       Well, either this, or we duplicate the `DictionaryFieldMapper` APIs 
here. I have no particular preference.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to