kszucs commented on a change in pull request #8088:
URL: https://github.com/apache/arrow/pull/8088#discussion_r491304867



##########
File path: cpp/src/arrow/python/python_to_arrow.cc
##########
@@ -329,186 +300,106 @@ struct ValueConverter<DurationType> {
         default:
           return Status::UnknownError("Invalid time unit");
       }
+    } else if (PyArray_CheckAnyScalarExact(obj)) {
+      // validate that the numpy scalar has np.datetime64 dtype
+      std::shared_ptr<DataType> numpy_type;
+      RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), 
&numpy_type));
+      if (!numpy_type->Equals(*type)) {
+        return Status::NotImplemented("Expected np.timedelta64 but got: ",
+                                      numpy_type->ToString());
+      }
+      return reinterpret_cast<PyTimedeltaScalarObject*>(obj)->obval;
     } else {
       RETURN_NOT_OK(internal::CIntFromPython(obj, &value));
     }
     return value;
   }
 
-  static inline Result<int64_t> FromNumpy(PyObject* obj, TimeUnit::type unit) {
-    // validate that the numpy scalar has np.timedelta64 dtype
-    std::shared_ptr<DataType> type;
-    RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &type));
-    if (type->id() != DurationType::type_id) {
-      // TODO(kszucs): the message should highlight the received numpy dtype
-      return Status::Invalid("Expected np.timedelta64 but got: ", 
type->ToString());
-    }
-    // validate that the time units are matching
-    if (unit != checked_cast<const DurationType&>(*type).unit()) {
-      return Status::NotImplemented(
-          "Cannot convert NumPy np.timedelta64 objects with differing unit");
-    }
-    // convert the numpy value
-    return reinterpret_cast<PyTimedeltaScalarObject*>(obj)->obval;
-  }
-};
-
-template <typename Type>
-struct ValueConverter<Type, enable_if_any_binary<Type>> {
-  static inline Result<PyBytesView> FromPython(PyObject* obj) {
-    PyBytesView view;
-    RETURN_NOT_OK(view.FromString(obj));
-    return std::move(view);
-  }
-};
+  // The binary-like intermediate representation is PyBytesView because it 
keeps temporary
+  // python objects alive (non-contiguous memoryview) and stores whether the 
original
+  // object was unicode encoded or not, which is used for unicode -> bytes 
coersion if
+  // there is a non-unicode object observed.
 
-template <typename Type>
-struct ValueConverter<Type, enable_if_string_like<Type>> {
-  static inline Result<PyBytesView> FromPython(PyObject* obj) {
-    // strict conversion, force output to be unicode / utf8 and validate that
-    // any binary values are utf8
-    bool is_utf8 = false;
-    PyBytesView view;
-
-    RETURN_NOT_OK(view.FromString(obj, &is_utf8));
-    if (!is_utf8) {
-      return internal::InvalidValue(obj, "was not a utf8 string");
-    }
-    return std::move(view);
+  static Result<PyBytesView> Convert(const BaseBinaryType*, const O&, I obj) {
+    return PyBytesView::FromString(obj);
   }
 
-  static inline Result<PyBytesView> FromPython(PyObject* obj, bool* is_utf8) {
-    PyBytesView view;
-
-    // Non-strict conversion; keep track of whether values are unicode or bytes
-    if (PyUnicode_Check(obj)) {
-      *is_utf8 = true;
-      RETURN_NOT_OK(view.FromUnicode(obj));
+  static Result<PyBytesView> Convert(const FixedSizeBinaryType* type, const 
O&, I obj) {
+    ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj));
+    if (ARROW_PREDICT_TRUE(view.size == type->byte_width())) {
+      return std::move(view);
     } else {
-      // If not unicode or bytes, FromBinary will error
-      *is_utf8 = false;
-      RETURN_NOT_OK(view.FromBinary(obj));
-    }
-    return std::move(view);
-  }
-};
-
-template <typename Type>
-struct ValueConverter<Type, enable_if_fixed_size_binary<Type>> {
-  static inline Result<PyBytesView> FromPython(PyObject* obj, int32_t 
byte_width) {
-    PyBytesView view;
-    RETURN_NOT_OK(view.FromString(obj));
-    if (ARROW_PREDICT_FALSE(view.size != byte_width)) {
       std::stringstream ss;
-      ss << "expected to be length " << byte_width << " was " << view.size;
+      ss << "expected to be length " << type->byte_width() << " was " << 
view.size;
       return internal::InvalidValue(obj, ss.str());
-    } else {
-      return std::move(view);
     }
   }
-};
-
-// ----------------------------------------------------------------------
-// Sequence converter base and CRTP "middle" subclasses
 
-class SeqConverter;
-
-// Forward-declare converter factory
-Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas,
-                    bool strict_conversions, bool ignore_timezone,
-                    std::unique_ptr<SeqConverter>* out);
-
-// Marshal Python sequence (list, tuple, etc.) to Arrow array
-class SeqConverter {
- public:
-  virtual ~SeqConverter() = default;
-
-  // Initialize the sequence converter with an ArrayBuilder created
-  // externally. The reason for this interface is that we have
-  // arrow::MakeBuilder which also creates child builders for nested types, so
-  // we have to pass in the child builders to child SeqConverter in the case of
-  // converting Python objects to Arrow nested types
-  virtual Status Init(ArrayBuilder* builder) = 0;
-
-  // Append a single null value to the builder
-  virtual Status AppendNull() = 0;
-
-  // Append a valid value
-  virtual Status AppendValue(PyObject* seq) = 0;
-
-  // Append a single python object handling Null values
-  virtual Status Append(PyObject* seq) = 0;
-
-  // Append the contents of a Python sequence to the underlying builder,
-  // virtual version
-  virtual Status Extend(PyObject* seq, int64_t size) = 0;
-
-  // Append the contents of a Python sequence to the underlying builder,
-  // virtual version
-  virtual Status ExtendMasked(PyObject* seq, PyObject* mask, int64_t size) = 0;
-
-  virtual Status Close() {
-    if (chunks_.size() == 0 || builder_->length() > 0) {
-      std::shared_ptr<Array> last_chunk;
-      RETURN_NOT_OK(builder_->Finish(&last_chunk));
-      chunks_.emplace_back(std::move(last_chunk));
+  template <typename T>
+  static enable_if_string<T, Result<PyBytesView>> Convert(const T*, const O& 
options,
+                                                          I obj) {
+    if (options.strict) {
+      // Strict conversion, force output to be unicode / utf8 and validate that
+      // any binary values are utf8
+      ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj, true));
+      if (!view.is_utf8) {
+        return internal::InvalidValue(obj, "was not a utf8 string");
+      }
+      return std::move(view);
+    } else {
+      // Non-strict conversion; keep track of whether values are unicode or 
bytes
+      return PyBytesView::FromString(obj);
     }
-    return Status::OK();
   }
 
-  virtual Status GetResult(std::shared_ptr<ChunkedArray>* out) {
-    // Still some accumulated data in the builder. If there are no chunks, we
-    // always call Finish to deal with the edge case where a size-0 sequence
-    // was converted with a specific output type, like array([], type=t)
-    RETURN_NOT_OK(Close());
-    *out = std::make_shared<ChunkedArray>(this->chunks_, builder_->type());
-    return Status::OK();
+  static Result<bool> Convert(const DataType* type, const O&, I obj) {
+    return Status::NotImplemented("PyValue::Convert is not implemented for 
type ", type);
   }
-
-  ArrayBuilder* builder() const { return builder_; }
-
-  int num_chunks() const { return static_cast<int>(chunks_.size()); }
-
- protected:
-  ArrayBuilder* builder_;
-  bool unfinished_builder_;
-  std::vector<std::shared_ptr<Array>> chunks_;
 };
 
-template <typename Type, NullCoding null_coding = NullCoding::NONE_ONLY>
-class TypedConverter : public SeqConverter {
- public:
-  using BuilderType = typename TypeTraits<Type>::BuilderType;
+// Forward-declare the type-family specific converters to inject them to the 
PyConverter
+// base class as type aliases.
 
-  Status Init(ArrayBuilder* builder) override {
-    builder_ = builder;
-    DCHECK_NE(builder_, nullptr);
-    typed_builder_ = checked_cast<BuilderType*>(builder);
-    return Status::OK();
-  }
+template <typename T, typename Enable = void>
+class PyPrimitiveConverter;
 
-  // Append a missing item (default implementation)
-  Status AppendNull() override { return this->typed_builder_->AppendNull(); }
+template <typename T, typename Enable = void>
+class PyDictionaryConverter;
 
-  // Append null if the obj is None or pandas null otherwise the valid value
-  Status Append(PyObject* obj) override {
-    return NullChecker<null_coding>::Check(obj) ? AppendNull() : 
AppendValue(obj);
-  }
+template <typename T>
+class PyListConverter;
 
-  Status Extend(PyObject* obj, int64_t size) override {
+class PyStructConverter;
+
+// The base Converter class is a mixin with predefined behavior and 
constructors.
+class PyConverter : public Converter<PyObject*, PyConversionOptions, 
PyConverter> {
+ public:
+  // Type aliases used by the parent Converter mixin's factory.
+  template <typename T>
+  using Primitive = PyPrimitiveConverter<T>;
+  template <typename T>
+  using Dictionary = PyDictionaryConverter<T>;
+  template <typename T>
+  using List = PyListConverter<T>;
+  using Struct = PyStructConverter;
+
+  // Convert and append a sequence of values
+  Status Extend(PyObject* values, int64_t size) {

Review comment:
       Indeed, why I wasn't thinking of this before.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to