[GitHub] [arrow] bkietz commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings

2020-04-29 Thread GitBox


bkietz commented on a change in pull request #7026:
URL: https://github.com/apache/arrow/pull/7026#discussion_r417342700



##
File path: cpp/src/arrow/dataset/filter.cc
##
@@ -1261,7 +1264,212 @@ Result> 
TreeEvaluator::Filter(
   return batch->Slice(0, 0);
 }
 
-std::shared_ptr scalar(bool value) { return 
scalar(MakeScalar(value)); }
+std::shared_ptr scalar(bool value) { return 
scalar(MakeScalar(value)); }
+
+struct SerializeImpl {
+  Result> ToArray(const Expression& expr) const {
+return VisitExpression(expr, *this);
+  }
+
+  Result> TaggedWithChildren(const Expression& 
expr,
+  ArrayVector 
children) const {
+children.emplace_back();
+ARROW_ASSIGN_OR_RAISE(children.back(),
+  MakeArrayFromScalar(Int32Scalar(expr.type()), 1));
+
+return StructArray::Make(children, 
std::vector(children.size(), ""));
+  }
+
+  Result> operator()(const FieldExpression& expr) 
const {
+ARROW_ASSIGN_OR_RAISE(auto name, 
MakeArrayFromScalar(StringScalar(expr.name()), 1));
+return TaggedWithChildren(expr, {name});
+  }
+
+  Result> operator()(const ScalarExpression& 
expr) const {
+ARROW_ASSIGN_OR_RAISE(auto value, MakeArrayFromScalar(*expr.value(), 1));
+return TaggedWithChildren(expr, {value});
+  }
+
+  Result> operator()(const UnaryExpression& expr) 
const {
+ARROW_ASSIGN_OR_RAISE(auto operand, ToArray(*expr.operand()));
+return TaggedWithChildren(expr, {operand});
+  }
+
+  Result> operator()(const CastExpression& expr) 
const {
+ARROW_ASSIGN_OR_RAISE(auto operand, ToArray(*expr.operand()));
+
+std::shared_ptr is_like_expr, to;
+if (const auto& to_type = expr.to_type()) {
+  ARROW_ASSIGN_OR_RAISE(is_like_expr, 
MakeArrayFromScalar(BooleanScalar(false), 1));
+  ARROW_ASSIGN_OR_RAISE(to, MakeArrayOfNull(to_type, 1));
+}
+if (const auto& like_expr = expr.like_expr()) {
+  ARROW_ASSIGN_OR_RAISE(is_like_expr, 
MakeArrayFromScalar(BooleanScalar(true), 1));
+  ARROW_ASSIGN_OR_RAISE(to, ToArray(*like_expr));
+}
+
+return TaggedWithChildren(expr, {operand, is_like_expr, to});
+  }
+
+  Result> operator()(const BinaryExpression& 
expr) const {
+ARROW_ASSIGN_OR_RAISE(auto left_operand, ToArray(*expr.left_operand()));
+ARROW_ASSIGN_OR_RAISE(auto right_operand, ToArray(*expr.right_operand()));
+return TaggedWithChildren(expr, {left_operand, right_operand});
+  }
+
+  Result> operator()(
+  const ComparisonExpression& expr) const {
+ARROW_ASSIGN_OR_RAISE(auto left_operand, ToArray(*expr.left_operand()));
+ARROW_ASSIGN_OR_RAISE(auto right_operand, ToArray(*expr.right_operand()));
+ARROW_ASSIGN_OR_RAISE(auto op, MakeArrayFromScalar(Int32Scalar(expr.op()), 
1));
+return TaggedWithChildren(expr, {left_operand, right_operand, op});
+  }
+
+  Result> operator()(const InExpression& expr) 
const {
+ARROW_ASSIGN_OR_RAISE(auto operand, ToArray(*expr.operand()));
+
+auto set_type = list(expr.set()->type());
+
+ARROW_ASSIGN_OR_RAISE(auto set_offsets, AllocateBuffer(sizeof(int32_t) * 
2));
+reinterpret_cast(set_offsets->mutable_data())[0] = 0;
+reinterpret_cast(set_offsets->mutable_data())[1] =
+static_cast(expr.set()->length());
+
+auto set_values = expr.set();
+
+auto set = std::make_shared(std::move(set_type), 1, 
std::move(set_offsets),
+   std::move(set_values));
+return TaggedWithChildren(expr, {operand, set});
+  }
+
+  Result> operator()(const Expression& expr) 
const {
+return Status::NotImplemented("serialization of ", expr.ToString());
+  }
+};
+
+Result> Expression::Serialize() const {
+  ARROW_ASSIGN_OR_RAISE(auto array, SerializeImpl{}.ToArray(*this));
+  ARROW_ASSIGN_OR_RAISE(auto batch, RecordBatch::FromStructArray(array));
+  ARROW_ASSIGN_OR_RAISE(auto stream, io::BufferOutputStream::Create());
+  ARROW_ASSIGN_OR_RAISE(auto writer, ipc::NewFileWriter(stream.get(), 
batch->schema()));
+  RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
+  RETURN_NOT_OK(writer->Close());
+  return stream->Finish();
+}
+
+struct DeserializeImpl {
+  Result> FromArray(const Array& array) const {
+if (array.type_id() != Type::STRUCT || array.length() != 1) {
+  return Status::Invalid("can only deserialize expressions from 
unit-length",
+ " StructArray, got ", array);
+}
+const auto& struct_array = checked_cast(array);
+
+ARROW_ASSIGN_OR_RAISE(auto expression_type, 
GetExpressionType(struct_array));
+switch (expression_type) {
+  case ExpressionType::FIELD: {
+ARROW_ASSIGN_OR_RAISE(auto name, GetView(struct_array, 0));
+return field_ref(name.to_string());
+  }
+
+  case ExpressionType::SCALAR: {
+ARROW_ASSIGN_OR_RAISE(auto value,
+  Scalar::FromArraySlot(*struct_array.field(0), 
0));
+return scalar(std::move(value));
+  }
+
+

[GitHub] [arrow] bkietz commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings

2020-04-29 Thread GitBox


bkietz commented on a change in pull request #7026:
URL: https://github.com/apache/arrow/pull/7026#discussion_r417375378



##
File path: cpp/src/arrow/scalar.cc
##
@@ -252,6 +270,100 @@ Result> Scalar::Parse(const 
std::shared_ptr& t
   return ScalarParseImpl{type, s}.Finish();
 }
 
+struct ScalarFromArraySlotImpl {
+  template 
+  using ScalarType = typename TypeTraits::ScalarType;
+
+  Status Visit(const NullArray& a) {
+out_ = std::make_shared();
+return Status::OK();
+  }
+
+  Status Visit(const BooleanArray& a) { return Finish(a.Value(index_)); }
+
+  template 
+  Status Visit(const NumericArray& a) {
+return Finish(a.Value(index_));
+  }
+
+  template 
+  Status Visit(const BaseBinaryArray& a) {
+return Finish(a.GetString(index_));

Review comment:
   Buffer::FromString wraps the allocation of the std::string without 
copying





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] bkietz commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings

2020-04-29 Thread GitBox


bkietz commented on a change in pull request #7026:
URL: https://github.com/apache/arrow/pull/7026#discussion_r417341608



##
File path: cpp/src/arrow/scalar.cc
##
@@ -252,6 +270,100 @@ Result> Scalar::Parse(const 
std::shared_ptr& t
   return ScalarParseImpl{type, s}.Finish();
 }
 
+struct ScalarFromArraySlotImpl {
+  template 
+  using ScalarType = typename TypeTraits::ScalarType;
+
+  Status Visit(const NullArray& a) {
+out_ = std::make_shared();
+return Status::OK();
+  }
+
+  Status Visit(const BooleanArray& a) { return Finish(a.Value(index_)); }
+
+  template 
+  Status Visit(const NumericArray& a) {
+return Finish(a.Value(index_));
+  }
+
+  template 
+  Status Visit(const BaseBinaryArray& a) {
+return Finish(a.GetString(index_));

Review comment:
   will do





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] bkietz commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings

2020-04-29 Thread GitBox


bkietz commented on a change in pull request #7026:
URL: https://github.com/apache/arrow/pull/7026#discussion_r417364778



##
File path: cpp/src/arrow/dataset/filter.h
##
@@ -191,6 +197,10 @@ class ARROW_DS_EXPORT Expression {
   /// returns a debug string representing this expression
   virtual std::string ToString() const = 0;
 
+  /// serialize/deserialize an Expression.
+  Result> Serialize() const;

Review comment:
   I'll translate the python tests into a case in filter_test.cc





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] bkietz commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings

2020-04-29 Thread GitBox


bkietz commented on a change in pull request #7026:
URL: https://github.com/apache/arrow/pull/7026#discussion_r417358718



##
File path: cpp/src/arrow/util/iterator.h
##
@@ -138,11 +139,11 @@ class Iterator : public 
util::EqualityComparable> {
 value_ = IterationTraits::End();
 return;
   }
-  value_ = iterator_.Next();
+  value_ = iterator_->Next();
 }
 
 Result value_;
-Iterator iterator_;
+std::shared_ptr iterator_;

Review comment:
   This allows `Iterator::RangeIterator` to be copied, which is a 
requirement on iterators https://en.cppreference.com/w/cpp/named_req/Iterator
   
   We haven't caught this before because we only use `Iterator::begin` and 
`Iterator::end` in the context of range-for loops which don't require it. 
However cython wraps the range-for loop does, however, which is why I noticed 
the concept error.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] bkietz commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings

2020-04-29 Thread GitBox


bkietz commented on a change in pull request #7026:
URL: https://github.com/apache/arrow/pull/7026#discussion_r417347149



##
File path: python/pyarrow/_dataset.pyx
##
@@ -270,19 +425,20 @@ cdef class FileSystemDataset(Dataset):
 CFileSystemDataset* filesystem_dataset
 
 def __init__(self, paths_or_selector, schema=None, format=None,
- filesystem=None, partitions=None, root_partition=None):
+ filesystem=None, partitions=None, root_partition=_true):

Review comment:
   Okay, I'll coerce `None` to `_true` in the function body. If you'd 
prefer that we don't coerce `None` expressions to `_true` in python at all, 
we'll need to expose and use C++ overloads which don't accept an expression (so 
the coercion can happen in C++), but that seems less preferred to me.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] bkietz commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings

2020-04-29 Thread GitBox


bkietz commented on a change in pull request #7026:
URL: https://github.com/apache/arrow/pull/7026#discussion_r417342700



##
File path: cpp/src/arrow/dataset/filter.cc
##
@@ -1261,7 +1264,212 @@ Result> 
TreeEvaluator::Filter(
   return batch->Slice(0, 0);
 }
 
-std::shared_ptr scalar(bool value) { return 
scalar(MakeScalar(value)); }
+std::shared_ptr scalar(bool value) { return 
scalar(MakeScalar(value)); }
+
+struct SerializeImpl {
+  Result> ToArray(const Expression& expr) const {
+return VisitExpression(expr, *this);
+  }
+
+  Result> TaggedWithChildren(const Expression& 
expr,
+  ArrayVector 
children) const {
+children.emplace_back();
+ARROW_ASSIGN_OR_RAISE(children.back(),
+  MakeArrayFromScalar(Int32Scalar(expr.type()), 1));
+
+return StructArray::Make(children, 
std::vector(children.size(), ""));
+  }
+
+  Result> operator()(const FieldExpression& expr) 
const {
+ARROW_ASSIGN_OR_RAISE(auto name, 
MakeArrayFromScalar(StringScalar(expr.name()), 1));
+return TaggedWithChildren(expr, {name});
+  }
+
+  Result> operator()(const ScalarExpression& 
expr) const {
+ARROW_ASSIGN_OR_RAISE(auto value, MakeArrayFromScalar(*expr.value(), 1));
+return TaggedWithChildren(expr, {value});
+  }
+
+  Result> operator()(const UnaryExpression& expr) 
const {
+ARROW_ASSIGN_OR_RAISE(auto operand, ToArray(*expr.operand()));
+return TaggedWithChildren(expr, {operand});
+  }
+
+  Result> operator()(const CastExpression& expr) 
const {
+ARROW_ASSIGN_OR_RAISE(auto operand, ToArray(*expr.operand()));
+
+std::shared_ptr is_like_expr, to;
+if (const auto& to_type = expr.to_type()) {
+  ARROW_ASSIGN_OR_RAISE(is_like_expr, 
MakeArrayFromScalar(BooleanScalar(false), 1));
+  ARROW_ASSIGN_OR_RAISE(to, MakeArrayOfNull(to_type, 1));
+}
+if (const auto& like_expr = expr.like_expr()) {
+  ARROW_ASSIGN_OR_RAISE(is_like_expr, 
MakeArrayFromScalar(BooleanScalar(true), 1));
+  ARROW_ASSIGN_OR_RAISE(to, ToArray(*like_expr));
+}
+
+return TaggedWithChildren(expr, {operand, is_like_expr, to});
+  }
+
+  Result> operator()(const BinaryExpression& 
expr) const {
+ARROW_ASSIGN_OR_RAISE(auto left_operand, ToArray(*expr.left_operand()));
+ARROW_ASSIGN_OR_RAISE(auto right_operand, ToArray(*expr.right_operand()));
+return TaggedWithChildren(expr, {left_operand, right_operand});
+  }
+
+  Result> operator()(
+  const ComparisonExpression& expr) const {
+ARROW_ASSIGN_OR_RAISE(auto left_operand, ToArray(*expr.left_operand()));
+ARROW_ASSIGN_OR_RAISE(auto right_operand, ToArray(*expr.right_operand()));
+ARROW_ASSIGN_OR_RAISE(auto op, MakeArrayFromScalar(Int32Scalar(expr.op()), 
1));
+return TaggedWithChildren(expr, {left_operand, right_operand, op});
+  }
+
+  Result> operator()(const InExpression& expr) 
const {
+ARROW_ASSIGN_OR_RAISE(auto operand, ToArray(*expr.operand()));
+
+auto set_type = list(expr.set()->type());
+
+ARROW_ASSIGN_OR_RAISE(auto set_offsets, AllocateBuffer(sizeof(int32_t) * 
2));
+reinterpret_cast(set_offsets->mutable_data())[0] = 0;
+reinterpret_cast(set_offsets->mutable_data())[1] =
+static_cast(expr.set()->length());
+
+auto set_values = expr.set();
+
+auto set = std::make_shared(std::move(set_type), 1, 
std::move(set_offsets),
+   std::move(set_values));
+return TaggedWithChildren(expr, {operand, set});
+  }
+
+  Result> operator()(const Expression& expr) 
const {
+return Status::NotImplemented("serialization of ", expr.ToString());
+  }
+};
+
+Result> Expression::Serialize() const {
+  ARROW_ASSIGN_OR_RAISE(auto array, SerializeImpl{}.ToArray(*this));
+  ARROW_ASSIGN_OR_RAISE(auto batch, RecordBatch::FromStructArray(array));
+  ARROW_ASSIGN_OR_RAISE(auto stream, io::BufferOutputStream::Create());
+  ARROW_ASSIGN_OR_RAISE(auto writer, ipc::NewFileWriter(stream.get(), 
batch->schema()));
+  RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
+  RETURN_NOT_OK(writer->Close());
+  return stream->Finish();
+}
+
+struct DeserializeImpl {
+  Result> FromArray(const Array& array) const {
+if (array.type_id() != Type::STRUCT || array.length() != 1) {
+  return Status::Invalid("can only deserialize expressions from 
unit-length",
+ " StructArray, got ", array);
+}
+const auto& struct_array = checked_cast(array);
+
+ARROW_ASSIGN_OR_RAISE(auto expression_type, 
GetExpressionType(struct_array));
+switch (expression_type) {
+  case ExpressionType::FIELD: {
+ARROW_ASSIGN_OR_RAISE(auto name, GetView(struct_array, 0));
+return field_ref(name.to_string());
+  }
+
+  case ExpressionType::SCALAR: {
+ARROW_ASSIGN_OR_RAISE(auto value,
+  Scalar::FromArraySlot(*struct_array.field(0), 
0));
+return scalar(std::move(value));
+  }
+
+

[GitHub] [arrow] bkietz commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings

2020-04-29 Thread GitBox


bkietz commented on a change in pull request #7026:
URL: https://github.com/apache/arrow/pull/7026#discussion_r417342047



##
File path: cpp/src/arrow/dataset/filter.cc
##
@@ -1261,7 +1264,212 @@ Result> 
TreeEvaluator::Filter(
   return batch->Slice(0, 0);
 }
 
-std::shared_ptr scalar(bool value) { return 
scalar(MakeScalar(value)); }
+std::shared_ptr scalar(bool value) { return 
scalar(MakeScalar(value)); }
+
+struct SerializeImpl {
+  Result> ToArray(const Expression& expr) const {
+return VisitExpression(expr, *this);
+  }
+
+  Result> TaggedWithChildren(const Expression& 
expr,
+  ArrayVector 
children) const {
+children.emplace_back();
+ARROW_ASSIGN_OR_RAISE(children.back(),
+  MakeArrayFromScalar(Int32Scalar(expr.type()), 1));
+
+return StructArray::Make(children, 
std::vector(children.size(), ""));
+  }
+
+  Result> operator()(const FieldExpression& expr) 
const {
+ARROW_ASSIGN_OR_RAISE(auto name, 
MakeArrayFromScalar(StringScalar(expr.name()), 1));
+return TaggedWithChildren(expr, {name});
+  }
+
+  Result> operator()(const ScalarExpression& 
expr) const {
+ARROW_ASSIGN_OR_RAISE(auto value, MakeArrayFromScalar(*expr.value(), 1));
+return TaggedWithChildren(expr, {value});
+  }
+
+  Result> operator()(const UnaryExpression& expr) 
const {
+ARROW_ASSIGN_OR_RAISE(auto operand, ToArray(*expr.operand()));
+return TaggedWithChildren(expr, {operand});
+  }
+
+  Result> operator()(const CastExpression& expr) 
const {
+ARROW_ASSIGN_OR_RAISE(auto operand, ToArray(*expr.operand()));
+
+std::shared_ptr is_like_expr, to;
+if (const auto& to_type = expr.to_type()) {
+  ARROW_ASSIGN_OR_RAISE(is_like_expr, 
MakeArrayFromScalar(BooleanScalar(false), 1));
+  ARROW_ASSIGN_OR_RAISE(to, MakeArrayOfNull(to_type, 1));
+}
+if (const auto& like_expr = expr.like_expr()) {
+  ARROW_ASSIGN_OR_RAISE(is_like_expr, 
MakeArrayFromScalar(BooleanScalar(true), 1));
+  ARROW_ASSIGN_OR_RAISE(to, ToArray(*like_expr));
+}
+
+return TaggedWithChildren(expr, {operand, is_like_expr, to});
+  }
+
+  Result> operator()(const BinaryExpression& 
expr) const {
+ARROW_ASSIGN_OR_RAISE(auto left_operand, ToArray(*expr.left_operand()));
+ARROW_ASSIGN_OR_RAISE(auto right_operand, ToArray(*expr.right_operand()));
+return TaggedWithChildren(expr, {left_operand, right_operand});
+  }
+
+  Result> operator()(
+  const ComparisonExpression& expr) const {
+ARROW_ASSIGN_OR_RAISE(auto left_operand, ToArray(*expr.left_operand()));
+ARROW_ASSIGN_OR_RAISE(auto right_operand, ToArray(*expr.right_operand()));
+ARROW_ASSIGN_OR_RAISE(auto op, MakeArrayFromScalar(Int32Scalar(expr.op()), 
1));
+return TaggedWithChildren(expr, {left_operand, right_operand, op});
+  }
+
+  Result> operator()(const InExpression& expr) 
const {
+ARROW_ASSIGN_OR_RAISE(auto operand, ToArray(*expr.operand()));
+
+auto set_type = list(expr.set()->type());
+
+ARROW_ASSIGN_OR_RAISE(auto set_offsets, AllocateBuffer(sizeof(int32_t) * 
2));
+reinterpret_cast(set_offsets->mutable_data())[0] = 0;
+reinterpret_cast(set_offsets->mutable_data())[1] =
+static_cast(expr.set()->length());
+
+auto set_values = expr.set();
+
+auto set = std::make_shared(std::move(set_type), 1, 
std::move(set_offsets),
+   std::move(set_values));
+return TaggedWithChildren(expr, {operand, set});
+  }
+
+  Result> operator()(const Expression& expr) 
const {
+return Status::NotImplemented("serialization of ", expr.ToString());
+  }
+};
+
+Result> Expression::Serialize() const {
+  ARROW_ASSIGN_OR_RAISE(auto array, SerializeImpl{}.ToArray(*this));

Review comment:
   will do

##
File path: cpp/src/arrow/dataset/filter.cc
##
@@ -1261,7 +1264,212 @@ Result> 
TreeEvaluator::Filter(
   return batch->Slice(0, 0);
 }
 
-std::shared_ptr scalar(bool value) { return 
scalar(MakeScalar(value)); }
+std::shared_ptr scalar(bool value) { return 
scalar(MakeScalar(value)); }
+
+struct SerializeImpl {

Review comment:
   will do





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] bkietz commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings

2020-04-29 Thread GitBox


bkietz commented on a change in pull request #7026:
URL: https://github.com/apache/arrow/pull/7026#discussion_r417341608



##
File path: cpp/src/arrow/scalar.cc
##
@@ -252,6 +270,100 @@ Result> Scalar::Parse(const 
std::shared_ptr& t
   return ScalarParseImpl{type, s}.Finish();
 }
 
+struct ScalarFromArraySlotImpl {
+  template 
+  using ScalarType = typename TypeTraits::ScalarType;
+
+  Status Visit(const NullArray& a) {
+out_ = std::make_shared();
+return Status::OK();
+  }
+
+  Status Visit(const BooleanArray& a) { return Finish(a.Value(index_)); }
+
+  template 
+  Status Visit(const NumericArray& a) {
+return Finish(a.Value(index_));
+  }
+
+  template 
+  Status Visit(const BaseBinaryArray& a) {
+return Finish(a.GetString(index_));

Review comment:
   will do





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] bkietz commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings

2020-04-29 Thread GitBox


bkietz commented on a change in pull request #7026:
URL: https://github.com/apache/arrow/pull/7026#discussion_r417341382



##
File path: cpp/src/arrow/scalar.cc
##
@@ -252,6 +270,100 @@ Result> Scalar::Parse(const 
std::shared_ptr& t
   return ScalarParseImpl{type, s}.Finish();
 }
 
+struct ScalarFromArraySlotImpl {
+  template 
+  using ScalarType = typename TypeTraits::ScalarType;
+
+  Status Visit(const NullArray& a) {
+out_ = std::make_shared();
+return Status::OK();
+  }
+
+  Status Visit(const BooleanArray& a) { return Finish(a.Value(index_)); }
+
+  template 
+  Status Visit(const NumericArray& a) {
+return Finish(a.Value(index_));
+  }
+
+  template 
+  Status Visit(const BaseBinaryArray& a) {
+return Finish(a.GetString(index_));
+  }
+
+  Status Visit(const FixedSizeBinaryArray& a) { return 
Finish(a.GetString(index_)); }
+
+  Status Visit(const DayTimeIntervalArray& a) { return 
Finish(a.Value(index_)); }
+
+  template 
+  Status Visit(const BaseListArray& a) {
+return Finish(a.value_slice(index_));
+  }
+
+  Status Visit(const FixedSizeListArray& a) { return 
Finish(a.value_slice(index_)); }
+
+  Status Visit(const StructArray& a) {
+ScalarVector children;
+for (const auto& child : a.fields()) {
+  children.emplace_back();
+  ARROW_ASSIGN_OR_RAISE(children.back(), Scalar::FromArraySlot(*child, 
index_));
+}
+return Finish(std::move(children));
+  }
+
+  Status Visit(const UnionArray& a) {
+return Status::NotImplemented("Non-null UnionScalar");
+  }
+
+  Status Visit(const DictionaryArray& a) {
+ARROW_ASSIGN_OR_RAISE(auto index, Scalar::FromArraySlot(*a.indices(), 
index_));

Review comment:
   I'll add `int64_t DictionaryArray::GetValueIndex(int64_t row_index)`, 
that's a nice helper





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] bkietz commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings

2020-04-29 Thread GitBox


bkietz commented on a change in pull request #7026:
URL: https://github.com/apache/arrow/pull/7026#discussion_r417340605



##
File path: cpp/src/arrow/type.cc
##
@@ -43,6 +43,49 @@
 #include "arrow/visitor_inline.h"
 
 namespace arrow {
+
+constexpr Type::type NullType::type_id;

Review comment:
   This is ensuring that `/\w+Type::type_id/` is a defined symbol in 
`libarrow.so`. Previously these have only been used within a translation unit 
which links into `libarrow.so`, but now I'm using them in filter.cc which gets 
linked to `libarrow_dataset.so`. If you remove these you'll find the linker 
emits an error because they're undefined in `libarrow.so`





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] bkietz commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings

2020-04-29 Thread GitBox


bkietz commented on a change in pull request #7026:
URL: https://github.com/apache/arrow/pull/7026#discussion_r417340926



##
File path: cpp/src/arrow/scalar.h
##
@@ -78,6 +78,8 @@ struct ARROW_EXPORT Scalar : public 
util::EqualityComparable {
   // TODO(bkietz) add compute::CastOptions
   Result> CastTo(std::shared_ptr to) const;
 
+  static Result> FromArraySlot(const Array& array, 
int64_t index);

Review comment:
   I'll move it





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] bkietz commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings

2020-04-28 Thread GitBox


bkietz commented on a change in pull request #7026:
URL: https://github.com/apache/arrow/pull/7026#discussion_r416795870



##
File path: python/pyarrow/tests/test_dataset.py
##
@@ -373,125 +354,51 @@ def test_partitioning():
 assert expr.equals(expected)
 
 
-def test_expression():
-a = ds.ScalarExpression(1)
-b = ds.ScalarExpression(1.1)
-c = ds.ScalarExpression(True)
-d = ds.ScalarExpression("string")
-e = ds.ScalarExpression(None)
-
-equal = ds.ComparisonExpression(ds.CompareOperator.Equal, a, b)
-greater = a > b
-assert equal.op == ds.CompareOperator.Equal
-
-and_ = ds.AndExpression(a, b)
-assert and_.left_operand.equals(a)
-assert and_.right_operand.equals(b)
-assert and_.equals(ds.AndExpression(a, b))
-assert and_.equals(and_)
-
-or_ = ds.OrExpression(a, b)
-not_ = ds.NotExpression(ds.OrExpression(a, b))
-is_valid = ds.IsValidExpression(a)
-cast_safe = ds.CastExpression(a, pa.int32())
-cast_unsafe = ds.CastExpression(a, pa.int32(), safe=False)
-in_ = ds.InExpression(a, pa.array([1, 2, 3]))
-
-assert is_valid.operand == a
-assert in_.set_.equals(pa.array([1, 2, 3]))
-assert cast_unsafe.to == pa.int32()
-assert cast_unsafe.safe is False
-assert cast_safe.safe is True
-
-condition = ds.ComparisonExpression(
-ds.CompareOperator.Greater,
-ds.FieldExpression('i64'),
-ds.ScalarExpression(5)
-)
+def test_expression_serialization():
+a = ds.scalar(1)
+b = ds.scalar(1.1)
+c = ds.scalar(True)
+d = ds.scalar("string")
+e = ds.scalar(None)
+
+condition = ds.field('i64') > 5
 schema = pa.schema([
 pa.field('i64', pa.int64()),
 pa.field('f64', pa.float64())
 ])
 assert condition.validate(schema) == pa.bool_()
 
-i64_is_5 = ds.ComparisonExpression(
-ds.CompareOperator.Equal,
-ds.FieldExpression('i64'),
-ds.ScalarExpression(5)
-)
-i64_is_7 = ds.ComparisonExpression(
-ds.CompareOperator.Equal,
-ds.FieldExpression('i64'),
-ds.ScalarExpression(7)
-)
-assert condition.assume(i64_is_5).equals(ds.ScalarExpression(False))
-assert condition.assume(i64_is_7).equals(ds.ScalarExpression(True))
-assert str(condition) == "(i64 > 5:int64)"
-assert "(i64 > 5:int64)" in repr(condition)
+assert condition.assume(ds.field('i64') == 5).equals(
+ds.scalar(False))
 
-all_exprs = [a, b, c, d, e, equal, greater, and_, or_, not_, is_valid,
- cast_unsafe, cast_safe, in_, condition, i64_is_5, i64_is_7]
+assert condition.assume(ds.field('i64') == 7).equals(
+ds.scalar(True))
+
+all_exprs = [a, b, c, d, e, a == b, a > b, a & b, a | b, ~c,
+ d.is_valid(), a.cast(pa.int32(), safe=False),
+ a.cast(pa.int32(), safe=False), a.isin([1, 2, 3]),
+ ds.field('i64') > 5, ds.field('i64') == 5,
+ ds.field('i64') == 7]
 for expr in all_exprs:
+assert isinstance(expr, ds.Expression)
 restored = pickle.loads(pickle.dumps(expr))
 assert expr.equals(restored)
 
 
-def test_expression_ergonomics():
+def test_expression_construction():
 zero = ds.scalar(0)
 one = ds.scalar(1)
 true = ds.scalar(True)
 false = ds.scalar(False)
 string = ds.scalar("string")
 field = ds.field("field")
 
-assert one.equals(ds.ScalarExpression(1))
-assert zero.equals(ds.ScalarExpression(0))
-assert true.equals(ds.ScalarExpression(True))
-assert false.equals(ds.ScalarExpression(False))
-assert string.equals(ds.ScalarExpression("string"))
-assert field.equals(ds.FieldExpression("field"))
-
-expected = ds.AndExpression(ds.ScalarExpression(1), ds.ScalarExpression(0))
-for expr in [one & zero, 1 & zero, one & 0]:
-assert expr.equals(expected)
-
-expected = ds.OrExpression(ds.ScalarExpression(1), ds.ScalarExpression(0))
-for expr in [one | zero, 1 | zero, one | 0]:
-assert expr.equals(expected)
-
-comparison_ops = [
-(operator.eq, ds.CompareOperator.Equal),
-(operator.ne, ds.CompareOperator.NotEqual),
-(operator.ge, ds.CompareOperator.GreaterEqual),
-(operator.le, ds.CompareOperator.LessEqual),
-(operator.lt, ds.CompareOperator.Less),
-(operator.gt, ds.CompareOperator.Greater),
-]
-for op, compare_op in comparison_ops:
-expr = op(zero, one)
-expected = ds.ComparisonExpression(compare_op, zero, one)
-assert expr.equals(expected)
-
-expr = ~true == false
-expected = ds.ComparisonExpression(
-ds.CompareOperator.Equal,
-ds.NotExpression(ds.ScalarExpression(True)),
-ds.ScalarExpression(False)
-)
-assert expr.equals(expected)
-
+zero | one == string
+~true == false

Review comment:
   Since we don't have an alternative way to construct expressions anymore, 
to what would 

[GitHub] [arrow] bkietz commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings

2020-04-28 Thread GitBox


bkietz commented on a change in pull request #7026:
URL: https://github.com/apache/arrow/pull/7026#discussion_r416761725



##
File path: python/pyarrow/tests/test_dataset.py
##
@@ -218,33 +216,21 @@ def test_filesystem_dataset(mockfs):
 
 # validation of required arguments
 with pytest.raises(TypeError, match="incorrect type"):
-ds.FileSystemDataset(paths, format=file_format, filesystem=mockfs)
+ds.FileSystemDataset(paths, schema=None, format=file_format,
+ filesystem=mockfs)

Review comment:
   will do





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] bkietz commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings

2020-04-28 Thread GitBox


bkietz commented on a change in pull request #7026:
URL: https://github.com/apache/arrow/pull/7026#discussion_r416760152



##
File path: python/pyarrow/_dataset.pyx
##
@@ -118,21 +280,15 @@ cdef class Dataset:
 ---
 fragments : iterator of Fragment
 """
-cdef:
-CFragmentIterator iterator
-shared_ptr[CFragment] fragment
+cdef CFragmentIterator c_fragments
 
 if filter is None or filter.expr == nullptr:
-iterator = self.dataset.GetFragments()
+c_fragments = self.dataset.GetFragments()
 else:
-iterator = self.dataset.GetFragments(filter.unwrap())
+c_fragments = self.dataset.GetFragments(filter.unwrap())
 
-while True:
-fragment = GetResultValue(iterator.Next())
-if fragment.get() == nullptr:
-raise StopIteration()
-else:
-yield Fragment.wrap(fragment)
+for maybe_fragment in c_fragments:

Review comment:
   actually all elements are `CResult[shared_ptr[CFragment]]`





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] bkietz commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings

2020-04-28 Thread GitBox


bkietz commented on a change in pull request #7026:
URL: https://github.com/apache/arrow/pull/7026#discussion_r416758776



##
File path: python/pyarrow/_dataset.pyx
##
@@ -41,6 +42,167 @@ def _forbid_instantiation(klass, subclasses_instead=True):
 raise TypeError(msg)
 
 
+cdef class Expression:
+
+cdef:
+shared_ptr[CExpression] wrapped
+CExpression* expr
+
+def __init__(self, Buffer buffer=None):
+if buffer is not None:
+c_buffer = pyarrow_unwrap_buffer(buffer)
+expr = GetResultValue(CExpression.Deserialize(deref(c_buffer)))
+self.init(expr)

Review comment:
   This would definitely be better but it gives me an error like 
   ```
   E   _pickle.PicklingError: Can't pickle : attribute lookup _deserialize on pyarrow._dataset failed
   ```
   I'll permute until this works.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] bkietz commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings

2020-04-28 Thread GitBox


bkietz commented on a change in pull request #7026:
URL: https://github.com/apache/arrow/pull/7026#discussion_r416746022



##
File path: python/pyarrow/_dataset.pyx
##
@@ -41,6 +42,167 @@ def _forbid_instantiation(klass, subclasses_instead=True):
 raise TypeError(msg)
 
 
+cdef class Expression:
+
+cdef:
+shared_ptr[CExpression] wrapped
+CExpression* expr
+
+def __init__(self, Buffer buffer=None):
+if buffer is not None:
+c_buffer = pyarrow_unwrap_buffer(buffer)
+expr = GetResultValue(CExpression.Deserialize(deref(c_buffer)))
+self.init(expr)
+
+cdef void init(self, const shared_ptr[CExpression]& sp):
+self.wrapped = sp
+self.expr = sp.get()
+
+@staticmethod
+cdef wrap(const shared_ptr[CExpression]& sp):
+self = Expression()
+self.init(sp)
+return self
+
+cdef inline shared_ptr[CExpression] unwrap(self):
+return self.wrapped
+
+def equals(self, Expression other):
+return self.expr.Equals(other.unwrap())
+
+def __str__(self):
+return frombytes(self.expr.ToString())
+
+def __repr__(self):
+return "".format(
+self.__class__.__name__, str(self)
+)
+
+def __reduce__(self):
+buffer = pyarrow_wrap_buffer(GetResultValue(self.expr.Serialize()))
+return Expression, (buffer,)
+
+def validate(self, Schema schema not None):
+"""Validate this expression for execution against a schema.
+
+This will check that all reference fields are present (fields not in
+the schema will be replaced with null) and all subexpressions are
+executable. Returns the type to which this expression will evaluate.
+
+Parameters
+--
+schema : Schema
+Schema to execute the expression on.
+
+Returns
+---
+type : DataType
+"""
+cdef:
+shared_ptr[CSchema] sp_schema
+CResult[shared_ptr[CDataType]] result
+sp_schema = pyarrow_unwrap_schema(schema)
+result = self.expr.Validate(deref(sp_schema))
+return pyarrow_wrap_data_type(GetResultValue(result))
+
+def assume(self, Expression given):
+"""Simplify to an equivalent Expression given assumed constraints."""
+return Expression.wrap(self.expr.Assume(given.unwrap()))
+
+def __invert__(self):
+return Expression.wrap(CMakeNotExpression(self.unwrap()))
+
+@staticmethod
+cdef shared_ptr[CExpression] _expr_or_scalar(object expr) except *:
+if isinstance(expr, Expression):
+return ( expr).unwrap()
+return ( Expression.scalar(expr)).unwrap()
+
+@staticmethod
+def wtf():
+return Expression.wrap(Expression._expr_or_scalar([]))

Review comment:
   yep





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] bkietz commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings

2020-04-27 Thread GitBox


bkietz commented on a change in pull request #7026:
URL: https://github.com/apache/arrow/pull/7026#discussion_r416092830



##
File path: python/pyarrow/_dataset.pyx
##
@@ -269,20 +454,21 @@ cdef class FileSystemDataset(Dataset):
 cdef:
 CFileSystemDataset* filesystem_dataset
 
-def __init__(self, paths_or_selector, schema=None, format=None,
- filesystem=None, partitions=None, root_partition=None):
+def __init__(self, paths_or_selector, schema, format, filesystem,
+ partitions=None, Expression root_partition=_true):

Review comment:
   reverted





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] bkietz commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings

2020-04-23 Thread GitBox


bkietz commented on a change in pull request #7026:
URL: https://github.com/apache/arrow/pull/7026#discussion_r414184803



##
File path: r/src/expression.cpp
##
@@ -21,99 +21,97 @@
 
 // [[arrow::export]]
 std::shared_ptr dataset___expr__field_ref(std::string name) {
-  return std::make_shared(std::move(name));
+  return ds::field_ref(std::move(name));
 }
 
 // [[arrow::export]]
-std::shared_ptr dataset___expr__equal(
+std::shared_ptr dataset___expr__equal(
 const std::shared_ptr& lhs,
 const std::shared_ptr& rhs) {
   return ds::equal(lhs, rhs);
 }
 
 // [[arrow::export]]
-std::shared_ptr dataset___expr__not_equal(
+std::shared_ptr dataset___expr__not_equal(
 const std::shared_ptr& lhs,
 const std::shared_ptr& rhs) {
   return ds::not_equal(lhs, rhs);
 }
 
 // [[arrow::export]]
-std::shared_ptr dataset___expr__greater(
+std::shared_ptr dataset___expr__greater(
 const std::shared_ptr& lhs,
 const std::shared_ptr& rhs) {
   return ds::greater(lhs, rhs);
 }
 
 // [[arrow::export]]
-std::shared_ptr dataset___expr__greater_equal(
+std::shared_ptr dataset___expr__greater_equal(
 const std::shared_ptr& lhs,
 const std::shared_ptr& rhs) {
   return ds::greater_equal(lhs, rhs);
 }
 
 // [[arrow::export]]
-std::shared_ptr dataset___expr__less(
+std::shared_ptr dataset___expr__less(
 const std::shared_ptr& lhs,
 const std::shared_ptr& rhs) {
   return ds::less(lhs, rhs);
 }
 
 // [[arrow::export]]
-std::shared_ptr dataset___expr__less_equal(
+std::shared_ptr dataset___expr__less_equal(
 const std::shared_ptr& lhs,
 const std::shared_ptr& rhs) {
   return ds::less_equal(lhs, rhs);
 }
 
 // [[arrow::export]]
-std::shared_ptr dataset___expr__in(
+std::shared_ptr dataset___expr__in(
 const std::shared_ptr& lhs,
 const std::shared_ptr& rhs) {
-  return std::make_shared(lhs->In(rhs));
+  return lhs->In(rhs).Copy();

Review comment:
   `Copy()` moves the expression from the stack into a shared_ptr, 
replacing make_shared.
   
   I can add `ds::in(...)` etc if desired.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] bkietz commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings

2020-04-23 Thread GitBox


bkietz commented on a change in pull request #7026:
URL: https://github.com/apache/arrow/pull/7026#discussion_r414184803



##
File path: r/src/expression.cpp
##
@@ -21,99 +21,97 @@
 
 // [[arrow::export]]
 std::shared_ptr dataset___expr__field_ref(std::string name) {
-  return std::make_shared(std::move(name));
+  return ds::field_ref(std::move(name));
 }
 
 // [[arrow::export]]
-std::shared_ptr dataset___expr__equal(
+std::shared_ptr dataset___expr__equal(
 const std::shared_ptr& lhs,
 const std::shared_ptr& rhs) {
   return ds::equal(lhs, rhs);
 }
 
 // [[arrow::export]]
-std::shared_ptr dataset___expr__not_equal(
+std::shared_ptr dataset___expr__not_equal(
 const std::shared_ptr& lhs,
 const std::shared_ptr& rhs) {
   return ds::not_equal(lhs, rhs);
 }
 
 // [[arrow::export]]
-std::shared_ptr dataset___expr__greater(
+std::shared_ptr dataset___expr__greater(
 const std::shared_ptr& lhs,
 const std::shared_ptr& rhs) {
   return ds::greater(lhs, rhs);
 }
 
 // [[arrow::export]]
-std::shared_ptr dataset___expr__greater_equal(
+std::shared_ptr dataset___expr__greater_equal(
 const std::shared_ptr& lhs,
 const std::shared_ptr& rhs) {
   return ds::greater_equal(lhs, rhs);
 }
 
 // [[arrow::export]]
-std::shared_ptr dataset___expr__less(
+std::shared_ptr dataset___expr__less(
 const std::shared_ptr& lhs,
 const std::shared_ptr& rhs) {
   return ds::less(lhs, rhs);
 }
 
 // [[arrow::export]]
-std::shared_ptr dataset___expr__less_equal(
+std::shared_ptr dataset___expr__less_equal(
 const std::shared_ptr& lhs,
 const std::shared_ptr& rhs) {
   return ds::less_equal(lhs, rhs);
 }
 
 // [[arrow::export]]
-std::shared_ptr dataset___expr__in(
+std::shared_ptr dataset___expr__in(
 const std::shared_ptr& lhs,
 const std::shared_ptr& rhs) {
-  return std::make_shared(lhs->In(rhs));
+  return lhs->In(rhs).Copy();

Review comment:
   `Copy()` moves the expression from the stack into a shared_ptr, 
replacing make_shared.
   
   I can add them if desired.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org