wesm commented on a change in pull request #7992:
URL: https://github.com/apache/arrow/pull/7992#discussion_r474285549



##########
File path: cpp/src/arrow/flight/internal.cc
##########
@@ -465,11 +465,9 @@ Status FromProto(const pb::SchemaResult& pb_result, 
std::string* result) {
 }
 
 Status SchemaToString(const Schema& schema, std::string* out) {
-  // TODO(wesm): Do we care about better memory efficiency here?
   ipc::DictionaryMemo unused_dict_memo;

Review comment:
       This can be removed now?

##########
File path: cpp/src/arrow/ipc/dictionary.h
##########
@@ -21,96 +21,145 @@
 
 #include <cstdint>
 #include <memory>
-#include <unordered_map>
 #include <utility>
 #include <vector>
 
-#include "arrow/memory_pool.h"
+#include "arrow/result.h"
 #include "arrow/status.h"
+#include "arrow/type_fwd.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
+namespace ipc {
+
+class FieldPosition {
+ public:
+  FieldPosition() : parent_(NULLPTR), index_(-1), depth_(0) {}
+
+  FieldPosition child(int index) const { return {this, index}; }
+
+  std::vector<int> path() const {
+    std::vector<int> path(depth_);
+    const FieldPosition* cur = this;
+    for (int i = depth_ - 1; i >= 0; --i) {
+      path[i] = cur->index_;
+      cur = cur->parent_;
+    }
+    return path;
+  }
+
+ protected:
+  FieldPosition(const FieldPosition* parent, int index)
+      : parent_(parent), index_(index), depth_(parent->depth_ + 1) {}
+
+  const FieldPosition* parent_;
+  int index_;
+  int depth_;
+};
 
-class Array;
-class DataType;
-class Field;
-class RecordBatch;
+/// \brief Map fields in a schema to dictionary ids
+///
+/// The mapping is structural, i.e. the field path (as a vector of indices)
+/// is associated to the dictionary id.

Review comment:
       It would be helpful to state that a dictionary id can be associated with 
multiple field paths

##########
File path: cpp/src/arrow/ipc/metadata_internal.h
##########
@@ -48,6 +48,7 @@ namespace flatbuf = org::apache::arrow::flatbuf;
 
 namespace ipc {
 
+class DictionaryFieldMapper;
 class DictionaryMemo;

Review comment:
       Perhaps we should move these both to `ipc::internal`?

##########
File path: cpp/src/arrow/ipc/dictionary.h
##########
@@ -21,96 +21,145 @@
 
 #include <cstdint>
 #include <memory>
-#include <unordered_map>
 #include <utility>
 #include <vector>
 
-#include "arrow/memory_pool.h"
+#include "arrow/result.h"
 #include "arrow/status.h"
+#include "arrow/type_fwd.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
+namespace ipc {
+
+class FieldPosition {
+ public:
+  FieldPosition() : parent_(NULLPTR), index_(-1), depth_(0) {}
+
+  FieldPosition child(int index) const { return {this, index}; }
+
+  std::vector<int> path() const {
+    std::vector<int> path(depth_);
+    const FieldPosition* cur = this;
+    for (int i = depth_ - 1; i >= 0; --i) {
+      path[i] = cur->index_;
+      cur = cur->parent_;
+    }
+    return path;
+  }
+
+ protected:
+  FieldPosition(const FieldPosition* parent, int index)
+      : parent_(parent), index_(index), depth_(parent->depth_ + 1) {}
+
+  const FieldPosition* parent_;
+  int index_;
+  int depth_;
+};
 
-class Array;
-class DataType;
-class Field;
-class RecordBatch;
+/// \brief Map fields in a schema to dictionary ids
+///
+/// The mapping is structural, i.e. the field path (as a vector of indices)
+/// is associated to the dictionary id.
+class ARROW_EXPORT DictionaryFieldMapper {
+ public:
+  DictionaryFieldMapper();
+  explicit DictionaryFieldMapper(const Schema& schema);
+  ~DictionaryFieldMapper();
 
-namespace ipc {
+  Status AddSchemaFields(const Schema& schema);
+  Status AddField(int64_t id, std::vector<int> field_path);
 
-/// \brief Memoization data structure for assigning id numbers to
-/// dictionaries and tracking their current state through possible
-/// deltas in an IPC stream
+  Result<int64_t> GetFieldId(std::vector<int> field_path) const;
+
+  int num_fields() const;
+
+ private:
+  struct Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+using DictionaryVector = std::vector<std::pair<int64_t, 
std::shared_ptr<Array>>>;
+
+/// \brief Memoization data structure for reading dictionaries from IPC streams
+///
+/// This structure tracks the following associations:
+/// - field position (structural) -> dictionary id
+/// - dictionary id -> value type
+/// - dictionary id -> dictionary (value) data
+///
+/// Together, they allow resolving dictionary data when reading an IPC stream,
+/// using metadata recorded in the schema message and data recorded in the
+/// dictionary batch messages (see ResolveDictionaries).
+///
+/// This structure isn't useful for writing an IPC stream, where only
+/// DictionaryFieldMapper is necessary.
 class ARROW_EXPORT DictionaryMemo {
  public:
-  using DictionaryVector = std::vector<std::pair<int64_t, 
std::shared_ptr<Array>>>;
-
   DictionaryMemo();
-  DictionaryMemo(DictionaryMemo&&) = default;
-  DictionaryMemo& operator=(DictionaryMemo&&) = default;
+  ~DictionaryMemo();
+
+  DictionaryFieldMapper& fields();

Review comment:
       Is this necessary? It might be clearer to have `DictionaryFieldMapper* 
mutable_fields()`

##########
File path: cpp/src/arrow/ipc/dictionary.h
##########
@@ -21,34 +21,75 @@
 
 #include <cstdint>
 #include <memory>
-#include <unordered_map>
 #include <utility>
 #include <vector>
 
-#include "arrow/memory_pool.h"
+#include "arrow/result.h"
 #include "arrow/status.h"
+#include "arrow/type_fwd.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
+namespace ipc {
 
-class Array;
-class DataType;
-class Field;
-class RecordBatch;
+class FieldPosition {
+ public:
+  FieldPosition() : parent_(NULLPTR), index_(-1), depth_(0) {}

Review comment:
       Seems reasonable to me

##########
File path: cpp/src/arrow/ipc/dictionary.cc
##########
@@ -30,6 +31,14 @@
 #include "arrow/status.h"
 #include "arrow/type.h"
 #include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+
+namespace std {
+template <>
+struct hash<arrow::FieldPath> {
+  size_t operator()(const arrow::FieldPath& path) const { return path.hash(); }
+};

Review comment:
       This might be better placed in a common header, but this is ok for now




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to