[GitHub] [arrow] bkietz commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API

2020-09-19 Thread GitBox


bkietz commented on a change in pull request #8088:
URL: https://github.com/apache/arrow/pull/8088#discussion_r491639719



##
File path: python/pyarrow/array.pxi
##
@@ -158,24 +158,44 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
 Notes
 -----
 Localized timestamps will currently be returned as UTC (pandas's native
-representation).  Timezone-naive data will be implicitly interpreted as
+representation). Timezone-naive data will be implicitly interpreted as
 UTC.
 
+Converting to dictionary array will promote to a wider integer type for
+indices if the number of distinct values cannot be represented, even if
+the index type was explicitly set. This means that if there are more than
+127 values the returned dictionary array's index type will be at least
+pa.int16() even if pa.int8() was passed to the function. Note that an
+explicit index type will not be demoted even if it is wider than required.
+
 Examples
 --------
 >>> import pandas as pd
 >>> import pyarrow as pa
 >>> pa.array(pd.Series([1, 2]))
-
+
 [
   1,
   2
 ]
 
+>>> pa.array(["a", "b", "a"], type=pa.dictionary(pa.int8(), pa.string()))
+<pyarrow.lib.DictionaryArray object at 0x...>
+-- dictionary:
+[
+  "a",
+  "b"
+]
+-- indices:
+[
+  0,
+  1,
+  0
+]
+

Review comment:
   Would it be worthwhile to add an example of index type promotion?
   ```suggestion
   >>> pa.array(range(1024), type=pa.dictionary(pa.int8(), pa.int64())).type.index_type
   DataType(int16)
   ```
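   An explicitly wide index type, on the other hand, is kept as-is. A quick sketch of that non-demotion behavior (exact reprs may vary by pyarrow version):
   ```python
   >>> import pyarrow as pa
   >>> # int8 would suffice for two dictionary values, but int32 is kept
   >>> pa.array(["a", "b", "a"], type=pa.dictionary(pa.int32(), pa.string())).type.index_type
   DataType(int32)
   ```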

##
File path: cpp/src/arrow/util/converter.h
##
@@ -0,0 +1,348 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include 
+#include 
+#include 
+
+#include "arrow/array.h"
+#include "arrow/builder.h"
+#include "arrow/chunked_array.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace internal {
+
+template <typename Input, typename Options>
+class Converter {
+ public:
+  using InputType = Input;
+  using OptionsType = Options;
+
+  virtual ~Converter() = default;
+
+  virtual Status Initialize(std::shared_ptr<DataType> type,
+                            std::shared_ptr<ArrayBuilder> builder,
+                            const std::vector<std::shared_ptr<Converter>>& children,
+                            OptionsType options) {
+    type_ = std::move(type);
+    builder_ = std::move(builder);
+    children_ = std::move(children);
+    options_ = std::move(options);
+    return Init();
+  }
+
+  virtual Status Init() { return Status::OK(); }
+
+  virtual Status Append(InputType value) {
+    return Status::NotImplemented("Converter not implemented for type ",
+                                  type()->ToString());
+  }
+
+  const std::shared_ptr<ArrayBuilder>& builder() const { return builder_; }
+
+  const std::shared_ptr<DataType>& type() const { return type_; }
+
+  OptionsType options() const { return options_; }
+
+  const std::vector<std::shared_ptr<Converter>>& children() const { return children_; }
+
+  Status Reserve(int64_t additional_capacity) {
+    return builder_->Reserve(additional_capacity);
+  }
+
+  Status AppendNull() { return builder_->AppendNull(); }
+
+  virtual Result<std::shared_ptr<Array>> ToArray() { return builder_->Finish(); }
+
+  virtual Result<std::shared_ptr<Array>> ToArray(int64_t length) {
+    ARROW_ASSIGN_OR_RAISE(auto arr, this->ToArray());
+    return arr->Slice(0, length);
+  }
+
+ protected:
+  std::shared_ptr<DataType> type_;
+  std::shared_ptr<ArrayBuilder> builder_;
+  std::vector<std::shared_ptr<Converter>> children_;
+  OptionsType options_;
+};
+
+template <typename T, typename BaseConverter>
+class PrimitiveConverter : public BaseConverter {
+ public:
+  using BuilderType = typename TypeTraits<T>::BuilderType;
+
+  Status Init() override {
+    primitive_type_ = checked_cast<const T*>(this->type_.get());
+    primitive_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const T* primitive_type_;
+  BuilderType* primitive_builder_;
+};
+
+template <typename T, typename BaseConverter>
+class ListConverter : public BaseConverter {

Review comment:
   ```suggestion
   template 
   class ListConverter : public BaseConverter {
   ```
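   To make the wiring above concrete, here is a toy Python rendering of the same control flow (illustration only; the names are hypothetical and this is neither the C++ nor the pyarrow API):
   ```python
   class Converter:
       """A converter owns a type, a builder, and child converters for nested types."""
   
       def __init__(self, typ, builder, children=()):
           self.type, self.builder, self.children = typ, builder, list(children)
   
       def append(self, value):  # overridden per logical type
           raise NotImplementedError(f"Converter not implemented for type {self.type}")
   
       def append_null(self):
           self.builder.append(None)
   
       def to_array(self):
           return self.builder  # stand-in for ArrayBuilder::Finish()
   
   class Int64Converter(Converter):
       def append(self, value):
           self.builder.append(int(value))
   
   conv = Int64Converter("int64", [])
   for v in (1, 2, None):
       conv.append_null() if v is None else conv.append(v)
   print(conv.to_array())  # [1, 2, None]
   ```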


[GitHub] [arrow] emkornfield commented on pull request #8219: ARROW-9603: Fix parquet write

2020-09-19 Thread GitBox


emkornfield commented on pull request #8219:
URL: https://github.com/apache/arrow/pull/8219#issuecomment-695392925


   Nm, I think this is likely the only reasonable approach.  We might consider 
pushing bitmap building up the stack at some point.



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] github-actions[bot] commented on pull request #8224: ARROW-10044: [Rust] Improved Arrow's README.

2020-09-19 Thread GitBox


github-actions[bot] commented on pull request #8224:
URL: https://github.com/apache/arrow/pull/8224#issuecomment-695361993


   https://issues.apache.org/jira/browse/ARROW-10044



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] emkornfield commented on pull request #8219: ARROW-9603: Fix parquet write

2020-09-19 Thread GitBox


emkornfield commented on pull request #8219:
URL: https://github.com/apache/arrow/pull/8219#issuecomment-695361437


   There is a better solution.  I'll update the PR



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] jorgecarleitao opened a new pull request #8224: ARROW-10044: [Rust] Improved Arrow's README.

2020-09-19 Thread GitBox


jorgecarleitao opened a new pull request #8224:
URL: https://github.com/apache/arrow/pull/8224


   



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] jorgecarleitao commented on pull request #8172: ARROW-9937: [Rust] [DataFusion] Improved aggregations

2020-09-19 Thread GitBox


jorgecarleitao commented on pull request #8172:
URL: https://github.com/apache/arrow/pull/8172#issuecomment-695358864


   @alamb, I tried to take the ScalarValue out of this PR for over two hours, but I was unable to: the current accumulators depend on it too heavily, and it would mean a lot of new code that this PR would then remove again.



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] nevi-me commented on pull request #8221: ARROW-9338: [Rust] Add clippy instructions

2020-09-19 Thread GitBox


nevi-me commented on pull request #8221:
URL: https://github.com/apache/arrow/pull/8221#issuecomment-695351599


   I'm unable to merge; something is wrong with my local setup. @andygrove, could you please merge if you see this message?



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] trxcllnt commented on pull request #2035: ARROW-2116: [JS] implement IPC writers

2020-09-19 Thread GitBox


trxcllnt commented on pull request #2035:
URL: https://github.com/apache/arrow/pull/2035#issuecomment-695349165


   > @t829702 if JS is not the way to interact with Arrow, then what is the purpose of the JS implementation? Is the JS implementation supposed to be for read-only use?
   
   My comment was about the Arrow JSON IPC _representation_. The Arrow JS library is a valid way to interact with Arrow data and streams.
   
   We have a special JSON IPC representation to run integration tests between the different Arrow language implementations. The `RecordBatchJSONReader` and `RecordBatchJSONWriter` classes are used to read/write this format, but it is designed to be read and written by humans and would be very inefficient for real-world use.
   
   > can different columns share a single Dictionary (to save some space)
   
   This isn't prohibited in the Arrow IPC format, but this is not supported by 
the current `DictionaryBuilder` implementation. That would be a good feature to 
add in [JIRA](https://issues.apache.org/jira/projects/ARROW).
   
   > is there a better arrow data type for JavaScript Date's JSON string format?
   
   No, that's the correct API for the Date data types. Providing a separate 
utility in Arrow to parse dates would simply be duplicating the (relatively 
well optimized) Date implementations provided by the JS VM.
   
   If you want to parse the string directly into a numeric timestamp without 
using Date, you could use one of the Arrow `Timestamp` data types instead of a 
Date type.
   
   > Is there a better way to create RecordBatch than the static method 
`arrow.RecordBatch.new`?
   
   No, this is the recommended way to construct a RecordBatch zero-copy.
   
   > it runs in 2.3s to convert a 50MB line-json file to a 21MB arrow file, which is not too bad for a single file. Most of my dataset source files are less than 100MB, but there is a huge number of them; what's the better (optimized and ergonomic) way to convert them?
   
   You may have success using a newline-delimited JSON parser like 
[ndjson](https://www.npmjs.com/package/ndjson) rather than JSON.parse. You can 
also pass a custom `dictionaryHashFunction` to the DictionaryBuilder; I opted 
for the ultra-fast [node metrohash](https://www.npmjs.com/package/metrohash) 
implementation in the [csv-to-arrow 
example](https://github.com/trxcllnt/csv-to-arrow-js/blob/f2596045474ce1742e3089da48a5c83a6005be90/index.js#L61).
   
   The easiest approach to speeding this up is to run your script in parallel 
on as many cores as you can afford, or use a different language implementation. 
If you have a GPU available, you can use 
[cuDF](https://docs.rapids.ai/api/cudf/stable/) to read newline-delimited JSON 
as a cuDF DataFrame, then serialize the DataFrame to disk as an Arrow table.
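   
   For comparison, a minimal sketch of the same conversion in Python, assuming pyarrow is an option for your pipeline (its JSON reader accepts newline-delimited JSON directly; the file names are hypothetical):
   ```python
   import pyarrow as pa
   from pyarrow import json as pa_json
   
   # Parse a newline-delimited JSON file into an Arrow Table.
   table = pa_json.read_json("data.ndjson")
   
   # Serialize the Table to disk in the Arrow IPC file format.
   with pa.OSFile("data.arrow", "wb") as sink:
       with pa.ipc.new_file(sink, table.schema) as writer:
           writer.write_table(table)
   ```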



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] github-actions[bot] commented on pull request #8223: ARROW-10040: [Rust] Add slice that realigns Buffer

2020-09-19 Thread GitBox


github-actions[bot] commented on pull request #8223:
URL: https://github.com/apache/arrow/pull/8223#issuecomment-695347870


   https://issues.apache.org/jira/browse/ARROW-10040



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] nevi-me commented on pull request #8223: ARROW-10040: [Rust] Add slice that realigns Buffer

2020-09-19 Thread GitBox


nevi-me commented on pull request #8223:
URL: https://github.com/apache/arrow/pull/8223#issuecomment-695347849


   @jhorstmann @paddyhoran this is related to the alignment fixes made recently. I noticed while reviewing another PR that we had a limitation on boolean kernels if offsets weren't a multiple of 8, so I've implemented a slice on `Buffer` that copies the buffer and removes the offset.
   
   I expect there to be a minor perf impact, and I only use the above slice method when necessary.
   
   It's very likely that my implementation can be improved upon, but I'm at my current limits on bit manipulation. Please see https://godbolt.org/z/bMc5qd for some of the generated assembly. I'm not happy with the instruction count and would appreciate some guidance or improvements. Feel free to push any changes directly to this branch.
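   
   For readers following along, the heart of the realignment is a per-byte shift that stitches the low bits of one input byte to the high bits of the next. A language-agnostic sketch in Python (LSB-first bit order, as Arrow uses; the function is illustrative, not the Rust API in this PR):
   ```python
   def realign_bitmap(buf: bytes, bit_offset: int, num_bits: int) -> bytearray:
       """Copy a packed LSB-first bitmap so that it starts at bit offset 0."""
       assert 0 <= bit_offset < 8
       out = bytearray((num_bits + 7) // 8)
       for i in range(len(out)):
           lo = buf[i] >> bit_offset  # low bits of the output byte
           # high bits come from the next input byte, if any
           hi = buf[i + 1] << (8 - bit_offset) if bit_offset and i + 1 < len(buf) else 0
           out[i] = (lo | hi) & 0xFF
       return out
   
   # bits 3..10 of the input bitmap become bits 0..7 of the output: 0b11111110
   print(bin(realign_bitmap(bytes([0b11110000, 0b00001111]), 3, 8)[0]))
   ```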



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] nevi-me opened a new pull request #8223: ARROW-10040: [Rust] Add slice that realigns Buffer

2020-09-19 Thread GitBox


nevi-me opened a new pull request #8223:
URL: https://github.com/apache/arrow/pull/8223


   This has the consequence of removing the alignment limit on bool kernels.
   However, it comes at the cost of slower buffer manipulation.



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] github-actions[bot] commented on pull request #8222: ARROW-10043: [Rust][DataFusion] Implement COUNT(DISTINCT col)

2020-09-19 Thread GitBox


github-actions[bot] commented on pull request #8222:
URL: https://github.com/apache/arrow/pull/8222#issuecomment-695345313


   https://issues.apache.org/jira/browse/ARROW-10043



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] drusso opened a new pull request #8222: ARROW-10043: [Rust][DataFusion] Implement COUNT(DISTINCT col)

2020-09-19 Thread GitBox


drusso opened a new pull request #8222:
URL: https://github.com/apache/arrow/pull/8222


   This is a proposal for an initial and partial implementation of the 
`DISTINCT` keyword. Only `COUNT(DISTINCT)` is supported, with the following 
conditions:
   
   (a) only one argument, i.e. `COUNT(DISTINCT col)`, but not `COUNT(DISTINCT 
col, other)`,
   (b) the argument is an integer type, and
   (c) the query must have a `GROUP BY` clause.
   
   **Implementation Overview:**
   
   The `Expr::AggregateFunction` variant has a new field, `distinct`, which 
mirrors the `distinct` flag from `SQLExpr::Function` (up until now this flag 
was unused). Any `Expr::AggregateFunction` may have its `distinct` flag 
switched to `true` if the keyword is present in the SQL query. However, the 
physical planner respects it only for `COUNT` expressions.
   
   The count distinct aggregation slots into the existing physical plans as a 
new set of `AggregateExpr`. To demonstrate, below are examples of the physical 
plans for the following query, where `c1` may be any data type, and `c2` is a 
`UInt8` column:
   
   ```
   SELECT c1, COUNT(DISTINCT c2) FROM t1 GROUP BY c1
   ```
   
   (a) Multiple Partitions:
   
   HashAggregateExec:
     mode: Final
     group_expr:
       Column(c1)
     aggr_expr:
       DistinctCountReduce(Column(c2))
     schema:
       c1: any
       c2: UInt64
     input:
       MergeExec:
         input:
           HashAggregateExec:
             mode: Partial
             group_expr:
               Column(c1)
             aggr_expr:
               DistinctCount(Column(c2))
             schema:
               c1: any
               c2: LargeList(UInt8)
             input:
               CsvExec:
                 schema:
                   c1: any
                   c2: UInt8
   
   The `DistinctCount` accumulator collects each `UInt8` into a list of distinct `UInt8` values. No counts are computed yet; this is a partial result: lists of distinct values. In the `RecordBatch`, this is a `LargeListArray` column. After the `MergeExec`, each list in the `LargeListArray` is accumulated by `DistinctCountReduce` (via `accumulate_batch()`), producing the _final_ sets of distinct values. Finally, given the finalized sets of distinct values, the counts are computed (always as `UInt64`).
   
   (b) Single Partition:
   
   HashAggregateExec:
     mode: NoPartial
     group_expr:
       Column(c1)
     aggr_expr:
       DistinctCountReduce(Column(c2))
     schema:
       c1: any
       c2: UInt64
     input:
       CsvExec:
         schema:
           c1: any
           c2: UInt8
   
   This scenario is unlike the multiple partition scenario: `DistinctCount` is 
_not_ used, and there are no partial sets of distinct values. Rather, in a 
single `HashAggregateExec` stage, each `UInt8` is accumulated into a distinct 
value set, then the counts are computed at the end of the stage. 
`DistinctCountReduce` is used, but note that unlike the multiple partition 
case, it accumulates scalars via `accumulate_scalar()`.
   
   There is a new aggregation mode: `NoPartial`. In summary, the modes are:
   
   - `NoPartial`: used in single-stage aggregations
   - `Partial`: used as the first stage of two-stage aggregations
   - `Final`: used as the second stage of two-stage aggregations
   
   Prior to the new `NoPartial` mode, `Partial` was handling both of what are 
now the responsibilities of `Partial` and `NoPartial`. No distinction was 
required, because _non-distinct_ aggregations (such as count, sum, min, max, 
and avg) do not need the distinction: the first aggregation stage is always the 
same, regardless of whether the aggregation is one-stage or two-stage. This is 
not the case for a _distinct_ count aggregation, and we can see that in the 
physical plans above.
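   
   The partial/final split in miniature (plain Python for illustration; this is not DataFusion code and the names are made up):
   ```python
   from collections import defaultdict
   
   rows = [("a", 1), ("a", 2), ("a", 1), ("b", 3)]  # (c1, c2)
   
   def partial(partition):
       """Partial stage: a set of distinct c2 values per group key."""
       groups = defaultdict(set)
       for c1, c2 in partition:
           groups[c1].add(c2)
       return groups
   
   def final(partials):
       """Final stage: union the partial sets, then count."""
       merged = defaultdict(set)
       for groups in partials:
           for key, values in groups.items():
               merged[key] |= values
       return {key: len(values) for key, values in merged.items()}
   
   partitions = [rows[:2], rows[2:]]
   print(final(partial(p) for p in partitions))  # {'a': 2, 'b': 1}
   ```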
   



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] wjones1 commented on pull request #6979: ARROW-7800 [Python] implement iter_batches() method for ParquetFile and ParquetReader

2020-09-19 Thread GitBox


wjones1 commented on pull request #6979:
URL: https://github.com/apache/arrow/pull/6979#issuecomment-695343480


   I'm back on this for the weekend and will be back as needed the week after 
next.
   
   @jorisvandenbossche I can confirm that once I merge in the latest changes 
from apache master, I am getting the batches to be the expected size (and 
spanning across the parquet chunks). I have updated the 
`test_iter_batches_columns_reader` unit test accordingly.
   
   However, I have found that if the selected columns include a categorical column, it reverts to the behavior I was seeing before. You should be able to reproduce this by editing line 178 of the parquet tests to include `categorical=True`; the `test_iter_batches_columns_reader` test will then fail. I'm fine leaving this fix for a future story.



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] wjones1 commented on a change in pull request #6979: ARROW-7800 [Python] implement iter_batches() method for ParquetFile and ParquetReader

2020-09-19 Thread GitBox


wjones1 commented on a change in pull request #6979:
URL: https://github.com/apache/arrow/pull/6979#discussion_r491481801



##
File path: python/pyarrow/parquet.py
##
@@ -301,6 +301,45 @@ def read_row_groups(self, row_groups, columns=None, use_threads=True,
                                            column_indices=column_indices,
                                            use_threads=use_threads)
 
+    def iter_batches(self, batch_size=65536, row_groups=None, columns=None,
+                     use_threads=True, use_pandas_metadata=False):
+        """
+        Read streaming batches from a Parquet file
+
+        Parameters
+        ----------
+        batch_size: int, default 64K
+            Maximum number of records to yield per batch. Batches may be
+            smaller if there aren't enough rows in a rowgroup.
+        row_groups: list
+            Only these row groups will be read from the file.
+        columns: list
+            If not None, only these columns will be read from the file. A
+            column name may be a prefix of a nested field, e.g. 'a' will select
+            'a.b', 'a.c', and 'a.d.e'
+        use_threads : boolean, default True
+            Perform multi-threaded column reads
+        use_pandas_metadata : boolean, default False
+            If True and file has custom pandas schema metadata, ensure that
+            index columns are also loaded
+
+        Yields

Review comment:
   Got it. I also realized that technically it's yielding RecordBatches, not Tables, so I'm changing it to `iterator of pyarrow.RecordBatch`.
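   
   For reference, usage would look roughly like this, assuming the signature documented in the diff above (the file name is hypothetical):
   ```python
   import pyarrow.parquet as pq
   
   pf = pq.ParquetFile("example.parquet")
   total = 0
   for batch in pf.iter_batches(batch_size=1024, columns=["a", "b"]):
       total += batch.num_rows  # each item is a pyarrow.RecordBatch
   print(total)
   ```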





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] arw2019 commented on a change in pull request #6979: ARROW-7800 [Python] implement iter_batches() method for ParquetFile and ParquetReader

2020-09-19 Thread GitBox


arw2019 commented on a change in pull request #6979:
URL: https://github.com/apache/arrow/pull/6979#discussion_r491479021



##
File path: python/pyarrow/parquet.py
##
@@ -301,6 +301,45 @@ def read_row_groups(self, row_groups, columns=None, use_threads=True,
                                            column_indices=column_indices,
                                            use_threads=use_threads)
 
+    def iter_batches(self, batch_size=65536, row_groups=None, columns=None,
+                     use_threads=True, use_pandas_metadata=False):
+        """
+        Read streaming batches from a Parquet file
+
+        Parameters
+        ----------
+        batch_size: int, default 64K
+            Maximum number of records to yield per batch. Batches may be
+            smaller if there aren't enough rows in a rowgroup.
+        row_groups: list
+            Only these row groups will be read from the file.
+        columns: list
+            If not None, only these columns will be read from the file. A
+            column name may be a prefix of a nested field, e.g. 'a' will select
+            'a.b', 'a.c', and 'a.d.e'
+        use_threads : boolean, default True
+            Perform multi-threaded column reads
+        use_pandas_metadata : boolean, default False
+            If True and file has custom pandas schema metadata, ensure that
+            index columns are also loaded
+
+        Yields

Review comment:
   Should be `Returns`

##
File path: python/pyarrow/parquet.py
##
@@ -301,6 +301,45 @@ def read_row_groups(self, row_groups, columns=None, use_threads=True,
                                            column_indices=column_indices,
                                            use_threads=use_threads)
 
+    def iter_batches(self, batch_size=65536, row_groups=None, columns=None,
+                     use_threads=True, use_pandas_metadata=False):
+        """
+        Read streaming batches from a Parquet file
+
+        Parameters
+        ----------
+        batch_size: int, default 64K
+            Maximum number of records to yield per batch. Batches may be
+            smaller if there aren't enough rows in a rowgroup.
+        row_groups: list
+            Only these row groups will be read from the file.
+        columns: list
+            If not None, only these columns will be read from the file. A
+            column name may be a prefix of a nested field, e.g. 'a' will select
+            'a.b', 'a.c', and 'a.d.e'
+        use_threads : boolean, default True
+            Perform multi-threaded column reads
+        use_pandas_metadata : boolean, default False
+            If True and file has custom pandas schema metadata, ensure that
+            index columns are also loaded
+
+        Yields
+        ------
+        pyarrow.table.Table
+            Content of each batch as a table (of columns)

Review comment:
   `Contents`





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] jorgecarleitao commented on pull request #8118: ARROW-9922: [Rust] Add StructArray::TryFrom (+40%)

2020-09-19 Thread GitBox


jorgecarleitao commented on pull request #8118:
URL: https://github.com/apache/arrow/pull/8118#issuecomment-695335519


   @nevi-me and @andygrove, I reverted the change with regard to the builder, so that this is an additive PR.
   
   @andygrove, with regard to dynamically building the array: note that a StructArray is composed almost entirely of child data; the struct itself is just a null bitmap and some pointers. Therefore, the cost of building a struct will always be driven by the allocation of those child buffers.
   
   That said, you are right that during the creation of the fields the benchmark clones the arrays, while a builder would build them on the fly and thus reduce the memory footprint.
   
   IMO that issue is separate from the creation of the struct itself (though related to building its children): it is the question of how we efficiently build non-struct arrays without first allocating vectors, which is what the builders aimed to solve. I am outlining some of this in #8211, which makes it possible to build primitive arrays from an iterator without exposing an unsafe API to users, and would avoid the double allocation that you refer to.
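   
   The "struct is mostly child data" point can be seen with pyarrow's analogous API (an analogy only; this PR concerns the Rust `TryFrom` constructor):
   ```python
   import pyarrow as pa
   
   # A struct array is little more than a validity bitmap plus pointers to
   # child arrays; composing existing children allocates no new value buffers.
   children = [pa.array([1, 2, 3]), pa.array(["x", "y", "z"])]
   struct = pa.StructArray.from_arrays(children, names=["a", "b"])
   print(struct.type)  # struct<a: int64, b: string>
   ```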



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] bkietz commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API

2020-09-19 Thread GitBox


bkietz commented on a change in pull request #8088:
URL: https://github.com/apache/arrow/pull/8088#discussion_r491397449



##
File path: cpp/src/arrow/util/converter.h
##
@@ -0,0 +1,353 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include 
+#include 
+#include 
+
+#include "arrow/array.h"
+#include "arrow/builder.h"
+#include "arrow/chunked_array.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace internal {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+template <typename T, typename BaseConverter>
+class PrimitiveConverter : public BaseConverter {
+ public:
+  using BuilderType = typename TypeTraits<T>::BuilderType;
+
+  Status Init() override {
+    primitive_type_ = checked_cast<const T*>(this->type_.get());
+    primitive_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const T* primitive_type_;
+  BuilderType* primitive_builder_;
+};
+
+template <typename T, typename BaseConverter>
+class ListConverter : public BaseConverter {
+ public:
+  using BuilderType = typename TypeTraits<T>::BuilderType;
+
+  Status Init() override {
+    list_type_ = checked_cast<const T*>(this->type_.get());
+    list_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    value_converter_ = this->children_[0];
+    return Status::OK();
+  }
+
+ protected:
+  const T* list_type_;
+  BuilderType* list_builder_;
+  std::shared_ptr<BaseConverter> value_converter_;
+};
+
+template <typename BaseConverter>
+class StructConverter : public BaseConverter {
+ public:
+  Status Init() override {
+    struct_type_ = checked_cast<const StructType*>(this->type_.get());
+    struct_builder_ = checked_cast<StructBuilder*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const StructType* struct_type_;
+  StructBuilder* struct_builder_;
+};
+
+template <typename U, typename BaseConverter>
+class DictionaryConverter : public BaseConverter {
+ public:
+  using BuilderType = DictionaryBuilder<U>;
+
+  Status Init() override {
+    dict_type_ = checked_cast<const DictionaryType*>(this->type_.get());
+    value_type_ = checked_cast<const U*>(dict_type_->value_type().get());
+    value_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const DictionaryType* dict_type_;
+  const U* value_type_;
+  BuilderType* value_builder_;
+};
+
+template <typename Converter>
+struct MakeConverterImpl;
+
+template <typename Input, typename Options>
+class Converter {
+ public:
+  using InputType = Input;
+  using OptionsType = Options;
+
+  template <typename T>
+  using Primitive = PrimitiveConverter<T, Converter>;
+  template <typename T>
+  using List = ListConverter<T, Converter>;
+  template <typename U>
+  using Dictionary = DictionaryConverter<U, Converter>;
+  using Struct = StructConverter<Converter>;
+
+  static Result<std::shared_ptr<Converter>> Make(std::shared_ptr<DataType> type,
+                                                 MemoryPool* pool,
+                                                 OptionsType options) {
+    std::shared_ptr<Converter> out;
+    MakeConverterImpl<Converter> visitor = {type, pool, options, &out};
+    ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor));
+    ARROW_RETURN_NOT_OK(out->Init());
+    return out;
+  }
+
+  virtual ~Converter() = default;
+
+  virtual Status Init() { return Status::OK(); }
+
+  virtual Status Append(InputType value) {
+    return Status::NotImplemented("Converter not implemented for type ",
+                                  type()->ToString());
+  }
Review comment:
   Implementations can still choose to return NotImplemented for some types, so I think the choice to do so should be left to them.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] kszucs commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API

2020-09-19 Thread GitBox


kszucs commented on a change in pull request #8088:
URL: https://github.com/apache/arrow/pull/8088#discussion_r491386279



##
File path: cpp/src/arrow/util/converter.h
##
@@ -0,0 +1,353 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include 
+#include 
+#include 
+
+#include "arrow/array.h"
+#include "arrow/builder.h"
+#include "arrow/chunked_array.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace internal {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+template <typename T, typename BaseConverter>
+class PrimitiveConverter : public BaseConverter {
+ public:
+  using BuilderType = typename TypeTraits<T>::BuilderType;
+
+  Status Init() override {
+    primitive_type_ = checked_cast<const T*>(this->type_.get());
+    primitive_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const T* primitive_type_;
+  BuilderType* primitive_builder_;
+};
+
+template <typename T, typename BaseConverter>
+class ListConverter : public BaseConverter {
+ public:
+  using BuilderType = typename TypeTraits<T>::BuilderType;
+
+  Status Init() override {
+    list_type_ = checked_cast<const T*>(this->type_.get());
+    list_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    value_converter_ = this->children_[0];
+    return Status::OK();
+  }
+
+ protected:
+  const T* list_type_;
+  BuilderType* list_builder_;
+  std::shared_ptr<BaseConverter> value_converter_;
+};
+
+template <typename BaseConverter>
+class StructConverter : public BaseConverter {
+ public:
+  Status Init() override {
+    struct_type_ = checked_cast<const StructType*>(this->type_.get());
+    struct_builder_ = checked_cast<StructBuilder*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const StructType* struct_type_;
+  StructBuilder* struct_builder_;
+};
+
+template <typename U, typename BaseConverter>
+class DictionaryConverter : public BaseConverter {
+ public:
+  using BuilderType = DictionaryBuilder<U>;
+
+  Status Init() override {
+    dict_type_ = checked_cast<const DictionaryType*>(this->type_.get());
+    value_type_ = checked_cast<const U*>(dict_type_->value_type().get());
+    value_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const DictionaryType* dict_type_;
+  const U* value_type_;
+  BuilderType* value_builder_;
+};
+
+template <typename Converter>
+struct MakeConverterImpl;
+
+template <typename Input, typename Options>
+class Converter {
+ public:
+  using InputType = Input;
+  using OptionsType = Options;
+
+  template <typename T>
+  using Primitive = PrimitiveConverter<T, Converter>;
+  template <typename T>
+  using List = ListConverter<T, Converter>;
+  template <typename U>
+  using Dictionary = DictionaryConverter<U, Converter>;
+  using Struct = StructConverter<Converter>;
+
+  static Result<std::shared_ptr<Converter>> Make(std::shared_ptr<DataType> type,
+                                                 MemoryPool* pool,
+                                                 OptionsType options) {
+    std::shared_ptr<Converter> out;
+    MakeConverterImpl<Converter> visitor = {type, pool, options, &out};
+    ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor));
+    ARROW_RETURN_NOT_OK(out->Init());
+    return out;
+  }
+
+  virtual ~Converter() = default;
+
+  virtual Status Init() { return Status::OK(); }
+
+  virtual Status Append(InputType value) {
+    return Status::NotImplemented("Converter not implemented for type ",
+                                  type()->ToString());
+  }
+
+  const std::shared_ptr<ArrayBuilder>& builder() const { return builder_; }
+
+  const std::shared_ptr<DataType>& type() const { return type_; }
+
+  OptionsType options() const { return options_; }
+
+  const std::vector<std::shared_ptr<Converter>> children() const { return children_; }
+
+  virtual Status Reserve(int64_t additional_capacity) {
+    return builder_->Reserve(additional_capacity);
+  }
+
+  virtual Status AppendNull() { return builder_->AppendNull(); }
+
+  virtual Result<std::shared_ptr<Array>> ToArray() { return builder_->Finish(); }
+
+  virtual Result<std::shared_ptr<Array>> ToArray(int64_t length) {
+    ARROW_ASSIGN_OR_RAISE(auto arr, this->ToArray());
+    return arr->Slice(0, length);
+  }
+
+ protected:
+  friend struct MakeConverterImpl<Converter>;
+
+  std::shared_ptr<DataType> type_;
+  std::shared_ptr<ArrayBuilder> builder_;
+  std::vector<std::shared_ptr<Converter>> children_;
+  OptionsType options_;
+};
+
+#define DICTIONARY_CASE(TYPE_ENUM, TYPE_CLASS)  \
+  case Type::TYPE_ENUM: \
+return Finish>( \
+std::move(

[GitHub] [arrow] kszucs commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API

2020-09-19 Thread GitBox


kszucs commented on a change in pull request #8088:
URL: https://github.com/apache/arrow/pull/8088#discussion_r491386599



##
File path: cpp/src/arrow/python/python_to_arrow.cc
##
@@ -329,186 +300,106 @@ struct ValueConverter {
 default:
   return Status::UnknownError("Invalid time unit");
   }
+    } else if (PyArray_CheckAnyScalarExact(obj)) {
+      // validate that the numpy scalar has np.timedelta64 dtype
+      std::shared_ptr<DataType> numpy_type;
+      RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type));
+      if (!numpy_type->Equals(*type)) {
+        return Status::NotImplemented("Expected np.timedelta64 but got: ",
+                                      numpy_type->ToString());
+      }
+      return reinterpret_cast<PyTimedeltaScalarObject*>(obj)->obval;
     } else {
       RETURN_NOT_OK(internal::CIntFromPython(obj, &value));
     }
     return value;
   }
 
-  static inline Result<int64_t> FromNumpy(PyObject* obj, TimeUnit::type unit) {
-    // validate that the numpy scalar has np.timedelta64 dtype
-    std::shared_ptr<DataType> type;
-    RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &type));
-    if (type->id() != DurationType::type_id) {
-      // TODO(kszucs): the message should highlight the received numpy dtype
-      return Status::Invalid("Expected np.timedelta64 but got: ", type->ToString());
-    }
-    // validate that the time units are matching
-    if (unit != checked_cast<const DurationType&>(*type).unit()) {
-      return Status::NotImplemented(
-          "Cannot convert NumPy np.timedelta64 objects with differing unit");
-    }
-    // convert the numpy value
-    return reinterpret_cast<PyTimedeltaScalarObject*>(obj)->obval;
-  }
-};
-
-template <typename Type>
-struct ValueConverter<Type, enable_if_binary<Type>> {
-  static inline Result<PyBytesView> FromPython(PyObject* obj) {
-    PyBytesView view;
-    RETURN_NOT_OK(view.FromString(obj));
-    return std::move(view);
-  }
-};
+  // The binary-like intermediate representation is PyBytesView because it keeps
+  // temporary python objects alive (non-contiguous memoryview) and stores whether
+  // the original object was unicode encoded or not, which is used for
+  // unicode -> bytes coercion if there is a non-unicode object observed.
 
-template <typename Type>
-struct ValueConverter<Type, enable_if_string<Type>> {
-  static inline Result<PyBytesView> FromPython(PyObject* obj) {
-    // strict conversion, force output to be unicode / utf8 and validate that
-    // any binary values are utf8
-    bool is_utf8 = false;
-    PyBytesView view;
-
-    RETURN_NOT_OK(view.FromString(obj, &is_utf8));
-    if (!is_utf8) {
-      return internal::InvalidValue(obj, "was not a utf8 string");
-    }
-    return std::move(view);
+  static Result<PyBytesView> Convert(const BaseBinaryType*, const O&, I obj) {
+    return PyBytesView::FromString(obj);
   }
 
-  static inline Result<PyBytesView> FromPython(PyObject* obj, bool* is_utf8) {
-    PyBytesView view;
-
-    // Non-strict conversion; keep track of whether values are unicode or bytes
-    if (PyUnicode_Check(obj)) {
-      *is_utf8 = true;
-      RETURN_NOT_OK(view.FromUnicode(obj));
+  static Result<PyBytesView> Convert(const FixedSizeBinaryType* type, const O&, I obj) {
+    ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj));
+    if (ARROW_PREDICT_TRUE(view.size == type->byte_width())) {
+      return std::move(view);
     } else {
-      // If not unicode or bytes, FromBinary will error
-      *is_utf8 = false;
-      RETURN_NOT_OK(view.FromBinary(obj));
-    }
-    return std::move(view);
-  }
-};
-
-template <typename Type>
-struct ValueConverter<Type, enable_if_fixed_size_binary<Type>> {
-  static inline Result<PyBytesView> FromPython(PyObject* obj, int32_t byte_width) {
-    PyBytesView view;
-    RETURN_NOT_OK(view.FromString(obj));
-    if (ARROW_PREDICT_FALSE(view.size != byte_width)) {
       std::stringstream ss;
-      ss << "expected to be length " << byte_width << " was " << view.size;
+      ss << "expected to be length " << type->byte_width() << " was " << view.size;
       return internal::InvalidValue(obj, ss.str());
-    } else {
-      return std::move(view);
     }
   }
-};
-
-// ----------------------------------------------------------------------
-// Sequence converter base and CRTP "middle" subclasses
 
-class SeqConverter;
-
-// Forward-declare converter factory
-Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas,
-                    bool strict_conversions, bool ignore_timezone,
-                    std::unique_ptr<SeqConverter>* out);
-
-// Marshal Python sequence (list, tuple, etc.) to Arrow array
-class SeqConverter {
- public:
-  virtual ~SeqConverter() = default;
-
-  // Initialize the sequence converter with an ArrayBuilder created
-  // externally. The reason for this interface is that we have
-  // arrow::MakeBuilder which also creates child builders for nested types, so
-  // we have to pass in the child builders to child SeqConverter in the case of
-  // converting Python objects to Arrow nested types
-  virtual Status Init(ArrayBuilder* builder) = 0;
-
-  // Append a single null value to the builder
-  virtual Status AppendNull() = 0;
-
-  // Append a vali

[GitHub] [arrow] kszucs commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API

2020-09-19 Thread GitBox


kszucs commented on a change in pull request #8088:
URL: https://github.com/apache/arrow/pull/8088#discussion_r491384129



##
File path: python/pyarrow/includes/libarrow.pxd
##
@@ -969,11 +969,13 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
 vector[shared_ptr[CScalar]] value
 CResult[shared_ptr[CScalar]] field(CFieldRef ref) const
 
-cdef cppclass CDictionaryScalar" arrow::DictionaryScalar"(CScalar):
-cppclass CDictionaryValue "arrow::DictionaryScalar::ValueType":
-shared_ptr[CScalar] index
-shared_ptr[CArray] dictionary
+cdef cppclass CDictionaryValue "arrow::DictionaryScalar::ValueType":
+shared_ptr[CScalar] index
+shared_ptr[CArray] dictionary
 
+cdef cppclass CDictionaryScalar" arrow::DictionaryScalar"(CScalar):
+CDictionaryScalar(CDictionaryValue value, shared_ptr[CDataType],
+  c_bool is_valid)
 CDictionaryValue value

Review comment:
   Updated.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] kszucs commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API

2020-09-19 Thread GitBox


kszucs commented on a change in pull request #8088:
URL: https://github.com/apache/arrow/pull/8088#discussion_r491382018



##
File path: cpp/src/arrow/util/converter.h
##
@@ -0,0 +1,353 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include 
+#include 
+#include 
+
+#include "arrow/array.h"
+#include "arrow/builder.h"
+#include "arrow/chunked_array.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace internal {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+template <typename T, typename BaseConverter>
+class PrimitiveConverter : public BaseConverter {
+ public:
+  using BuilderType = typename TypeTraits<T>::BuilderType;
+
+  Status Init() override {
+    primitive_type_ = checked_cast<const T*>(this->type_.get());
+    primitive_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const T* primitive_type_;
+  BuilderType* primitive_builder_;
+};
+
+template <typename T, typename BaseConverter>
+class ListConverter : public BaseConverter {
+ public:
+  using BuilderType = typename TypeTraits<T>::BuilderType;
+
+  Status Init() override {
+    list_type_ = checked_cast<const T*>(this->type_.get());
+    list_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    value_converter_ = this->children_[0];
+    return Status::OK();
+  }
+
+ protected:
+  const T* list_type_;
+  BuilderType* list_builder_;
+  std::shared_ptr<BaseConverter> value_converter_;
+};
+
+template <typename BaseConverter>
+class StructConverter : public BaseConverter {
+ public:
+  Status Init() override {
+    struct_type_ = checked_cast<const StructType*>(this->type_.get());
+    struct_builder_ = checked_cast<StructBuilder*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const StructType* struct_type_;
+  StructBuilder* struct_builder_;
+};
+
+template <typename U, typename BaseConverter>
+class DictionaryConverter : public BaseConverter {
+ public:
+  using BuilderType = DictionaryBuilder<U>;
+
+  Status Init() override {
+    dict_type_ = checked_cast<const DictionaryType*>(this->type_.get());
+    value_type_ = checked_cast<const U*>(dict_type_->value_type().get());
+    value_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const DictionaryType* dict_type_;
+  const U* value_type_;
+  BuilderType* value_builder_;
+};
+
+template <typename Converter>
+struct MakeConverterImpl;
+
+template <typename Input, typename Options>
+class Converter {
+ public:
+  using InputType = Input;
+  using OptionsType = Options;
+
+  template <typename T>
+  using Primitive = PrimitiveConverter<T, Converter>;
+  template <typename T>
+  using List = ListConverter<T, Converter>;
+  template <typename U>
+  using Dictionary = DictionaryConverter<U, Converter>;
+  using Struct = StructConverter<Converter>;
+
+  static Result<std::shared_ptr<Converter>> Make(std::shared_ptr<DataType> type,
+                                                 MemoryPool* pool,
+                                                 OptionsType options) {
+    std::shared_ptr<Converter> out;
+    MakeConverterImpl<Converter> visitor = {type, pool, options, &out};
+    ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor));
+    ARROW_RETURN_NOT_OK(out->Init());
+    return out;
+  }
+
+  virtual ~Converter() = default;
+
+  virtual Status Init() { return Status::OK(); }
+
+  virtual Status Append(InputType value) {
+    return Status::NotImplemented("Converter not implemented for type ",
+                                  type()->ToString());
+  }

Review comment:
   To not require complete type support from the implementor.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] kszucs commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API

2020-09-19 Thread GitBox


kszucs commented on a change in pull request #8088:
URL: https://github.com/apache/arrow/pull/8088#discussion_r491377495



##
File path: cpp/src/arrow/util/converter.h
##
@@ -0,0 +1,353 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include 
+#include 
+#include 
+
+#include "arrow/array.h"
+#include "arrow/builder.h"
+#include "arrow/chunked_array.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace internal {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+template <typename T, typename BaseConverter>
+class PrimitiveConverter : public BaseConverter {
+ public:
+  using BuilderType = typename TypeTraits<T>::BuilderType;
+
+  Status Init() override {
+    primitive_type_ = checked_cast<const T*>(this->type_.get());
+    primitive_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const T* primitive_type_;
+  BuilderType* primitive_builder_;
+};
+
+template <typename T, typename BaseConverter>
+class ListConverter : public BaseConverter {
+ public:
+  using BuilderType = typename TypeTraits<T>::BuilderType;
+
+  Status Init() override {
+    list_type_ = checked_cast<const T*>(this->type_.get());
+    list_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    value_converter_ = this->children_[0];
+    return Status::OK();
+  }
+
+ protected:
+  const T* list_type_;
+  BuilderType* list_builder_;
+  std::shared_ptr<BaseConverter> value_converter_;
+};
+
+template <typename BaseConverter>
+class StructConverter : public BaseConverter {
+ public:
+  Status Init() override {
+    struct_type_ = checked_cast<const StructType*>(this->type_.get());
+    struct_builder_ = checked_cast<StructBuilder*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const StructType* struct_type_;
+  StructBuilder* struct_builder_;
+};
+
+template <typename U, typename BaseConverter>
+class DictionaryConverter : public BaseConverter {
+ public:
+  using BuilderType = DictionaryBuilder<U>;
+
+  Status Init() override {
+    dict_type_ = checked_cast<const DictionaryType*>(this->type_.get());
+    value_type_ = checked_cast<const U*>(dict_type_->value_type().get());
+    value_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const DictionaryType* dict_type_;
+  const U* value_type_;
+  BuilderType* value_builder_;
+};
+
+template <typename Converter>
+struct MakeConverterImpl;
+
+template <typename Input, typename Options>
+class Converter {
+ public:
+  using InputType = Input;
+  using OptionsType = Options;
+
+  template <typename T>
+  using Primitive = PrimitiveConverter<T, Converter>;
+  template <typename T>
+  using List = ListConverter<T, Converter>;
+  template <typename U>
+  using Dictionary = DictionaryConverter<U, Converter>;
+  using Struct = StructConverter<Converter>;
+
+  static Result<std::shared_ptr<Converter>> Make(std::shared_ptr<DataType> type,
+                                                 MemoryPool* pool,
+                                                 OptionsType options) {
+    std::shared_ptr<Converter> out;
+    MakeConverterImpl<Converter> visitor = {type, pool, options, &out};
+    ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor));
+    ARROW_RETURN_NOT_OK(out->Init());
+    return out;
+  }
+
+  virtual ~Converter() = default;
+
+  virtual Status Init() { return Status::OK(); }
+
+  virtual Status Append(InputType value) {
+    return Status::NotImplemented("Converter not implemented for type ",
+                                  type()->ToString());
+  }
+
+  const std::shared_ptr<ArrayBuilder>& builder() const { return builder_; }
+
+  const std::shared_ptr<DataType>& type() const { return type_; }
+
+  OptionsType options() const { return options_; }
+
+  const std::vector<std::shared_ptr<Converter>> children() const { return children_; }
+
+  virtual Status Reserve(int64_t additional_capacity) {
+    return builder_->Reserve(additional_capacity);
+  }
+
+  virtual Status AppendNull() { return builder_->AppendNull(); }
+
+  virtual Result<std::shared_ptr<Array>> ToArray() { return builder_->Finish(); }
+
+  virtual Result<std::shared_ptr<Array>> ToArray(int64_t length) {
+    ARROW_ASSIGN_OR_RAISE(auto arr, this->ToArray());
+    return arr->Slice(0, length);
+  }
+
+ protected:
+  friend struct MakeConverterImpl<Converter>;
+
+  std::shared_ptr<DataType> type_;
+  std::shared_ptr<ArrayBuilder> builder_;
+  std::vector<std::shared_ptr<Converter>> children_;
+  OptionsType options_;
+};
+
+#define DICTIONARY_CASE(TYPE_ENUM, TYPE_CLASS)  \
+  case Type::TYPE_ENUM: \
+return Finish>( \
+std::move(

[GitHub] [arrow] kszucs commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API

2020-09-19 Thread GitBox


kszucs commented on a change in pull request #8088:
URL: https://github.com/apache/arrow/pull/8088#discussion_r491377151



##
File path: cpp/src/arrow/util/converter.h
##
@@ -0,0 +1,353 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include 
+#include 
+#include 
+
+#include "arrow/array.h"
+#include "arrow/builder.h"
+#include "arrow/chunked_array.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace internal {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+template <typename T, typename BaseConverter>
+class PrimitiveConverter : public BaseConverter {
+ public:
+  using BuilderType = typename TypeTraits<T>::BuilderType;
+
+  Status Init() override {
+    primitive_type_ = checked_cast<const T*>(this->type_.get());
+    primitive_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const T* primitive_type_;
+  BuilderType* primitive_builder_;
+};
+
+template <typename T, typename BaseConverter>
+class ListConverter : public BaseConverter {
+ public:
+  using BuilderType = typename TypeTraits<T>::BuilderType;
+
+  Status Init() override {
+    list_type_ = checked_cast<const T*>(this->type_.get());
+    list_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    value_converter_ = this->children_[0];
+    return Status::OK();
+  }
+
+ protected:
+  const T* list_type_;
+  BuilderType* list_builder_;
+  std::shared_ptr<BaseConverter> value_converter_;
+};
+
+template <typename BaseConverter>
+class StructConverter : public BaseConverter {
+ public:
+  Status Init() override {
+    struct_type_ = checked_cast<const StructType*>(this->type_.get());
+    struct_builder_ = checked_cast<StructBuilder*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const StructType* struct_type_;
+  StructBuilder* struct_builder_;
+};
+
+template <typename U, typename BaseConverter>
+class DictionaryConverter : public BaseConverter {
+ public:
+  using BuilderType = DictionaryBuilder<U>;
+
+  Status Init() override {
+    dict_type_ = checked_cast<const DictionaryType*>(this->type_.get());
+    value_type_ = checked_cast<const U*>(dict_type_->value_type().get());
+    value_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const DictionaryType* dict_type_;
+  const U* value_type_;
+  BuilderType* value_builder_;
+};
+
+template <typename Converter>
+struct MakeConverterImpl;
+
+template <typename Input, typename Options>
+class Converter {
+ public:
+  using InputType = Input;
+  using OptionsType = Options;
+
+  template <typename T>
+  using Primitive = PrimitiveConverter<T, Converter>;
+  template <typename T>
+  using List = ListConverter<T, Converter>;
+  template <typename U>
+  using Dictionary = DictionaryConverter<U, Converter>;
+  using Struct = StructConverter<Converter>;
+
+  static Result<std::shared_ptr<Converter>> Make(std::shared_ptr<DataType> type,
+                                                 MemoryPool* pool,
+                                                 OptionsType options) {
+    std::shared_ptr<Converter> out;
+    MakeConverterImpl<Converter> visitor = {type, pool, options, &out};
+    ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor));
+    ARROW_RETURN_NOT_OK(out->Init());
+    return out;
+  }
+
+  virtual ~Converter() = default;
+
+  virtual Status Init() { return Status::OK(); }
+
+  virtual Status Append(InputType value) {
+    return Status::NotImplemented("Converter not implemented for type ",
+                                  type()->ToString());
+  }
+
+  const std::shared_ptr<ArrayBuilder>& builder() const { return builder_; }
+
+  const std::shared_ptr<DataType>& type() const { return type_; }
+
+  OptionsType options() const { return options_; }
+
+  const std::vector<std::shared_ptr<Converter>> children() const { return children_; }
+
+  virtual Status Reserve(int64_t additional_capacity) {
+    return builder_->Reserve(additional_capacity);
+  }
+
+  virtual Status AppendNull() { return builder_->AppendNull(); }

Review comment:
   Updated.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] kszucs commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API

2020-09-19 Thread GitBox


kszucs commented on a change in pull request #8088:
URL: https://github.com/apache/arrow/pull/8088#discussion_r491376858



##
File path: cpp/src/arrow/util/converter.h
##
@@ -0,0 +1,353 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include 
+#include 
+#include 
+
+#include "arrow/array.h"
+#include "arrow/builder.h"
+#include "arrow/chunked_array.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace internal {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+template <typename T, typename BaseConverter>
+class PrimitiveConverter : public BaseConverter {
+ public:
+  using BuilderType = typename TypeTraits<T>::BuilderType;
+
+  Status Init() override {
+    primitive_type_ = checked_cast<const T*>(this->type_.get());
+    primitive_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const T* primitive_type_;
+  BuilderType* primitive_builder_;
+};
+
+template <typename T, typename BaseConverter>
+class ListConverter : public BaseConverter {
+ public:
+  using BuilderType = typename TypeTraits<T>::BuilderType;
+
+  Status Init() override {
+    list_type_ = checked_cast<const T*>(this->type_.get());
+    list_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    value_converter_ = this->children_[0];
+    return Status::OK();
+  }
+
+ protected:
+  const T* list_type_;
+  BuilderType* list_builder_;
+  std::shared_ptr<BaseConverter> value_converter_;
+};
+
+template <typename BaseConverter>
+class StructConverter : public BaseConverter {
+ public:
+  Status Init() override {
+    struct_type_ = checked_cast<const StructType*>(this->type_.get());
+    struct_builder_ = checked_cast<StructBuilder*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const StructType* struct_type_;
+  StructBuilder* struct_builder_;
+};
+
+template <typename U, typename BaseConverter>
+class DictionaryConverter : public BaseConverter {
+ public:
+  using BuilderType = DictionaryBuilder<U>;
+
+  Status Init() override {
+    dict_type_ = checked_cast<const DictionaryType*>(this->type_.get());
+    value_type_ = checked_cast<const U*>(dict_type_->value_type().get());
+    value_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const DictionaryType* dict_type_;
+  const U* value_type_;
+  BuilderType* value_builder_;
+};
+
+template <typename Converter>
+struct MakeConverterImpl;
+
+template <typename Input, typename Options>
+class Converter {
+ public:
+  using InputType = Input;
+  using OptionsType = Options;
+
+  template <typename T>
+  using Primitive = PrimitiveConverter<T, Converter>;
+  template <typename T>
+  using List = ListConverter<T, Converter>;
+  template <typename U>
+  using Dictionary = DictionaryConverter<U, Converter>;
+  using Struct = StructConverter<Converter>;
+
+  static Result<std::shared_ptr<Converter>> Make(std::shared_ptr<DataType> type,
+                                                 MemoryPool* pool, OptionsType options) {
+    std::shared_ptr<Converter> out;
+    MakeConverterImpl<Converter> visitor = {type, pool, options, &out};
+    ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor));
+    ARROW_RETURN_NOT_OK(out->Init());
+    return out;
+  }
+
+  virtual ~Converter() = default;
+
+  virtual Status Init() { return Status::OK(); }
+
+  virtual Status Append(InputType value) {
+    return Status::NotImplemented("Converter not implemented for type ",
+                                  type()->ToString());
+  }
+
+  const std::shared_ptr<ArrayBuilder>& builder() const { return builder_; }
+
+  const std::shared_ptr<DataType>& type() const { return type_; }
+
+  OptionsType options() const { return options_; }
+
+  const std::vector<std::shared_ptr<Converter>> children() const { return children_; }
+
+  virtual Status Reserve(int64_t additional_capacity) {

Review comment:
   Removed.
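
The hunk above also shows the CRTP "middle" layer (PrimitiveConverter, ListConverter, and friends), whose Init() caches down-casted type and builder pointers. A hedged sketch of a concrete leaf converter built on top of it, namespaces elided; Int64Converter and MyOptions are hypothetical names, not the PR's actual Python converters:

struct MyOptions {};  // hypothetical options struct

// Append() runs once per value, so the pointer cached by
// PrimitiveConverter::Init() keeps checked_cast off the hot path.
class Int64Converter
    : public PrimitiveConverter<Int64Type, Converter<int64_t, MyOptions>> {
 public:
  Status Append(int64_t value) override {
    return this->primitive_builder_->Append(value);
  }
};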









[GitHub] [arrow] kszucs commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API

2020-09-19 Thread GitBox


kszucs commented on a change in pull request #8088:
URL: https://github.com/apache/arrow/pull/8088#discussion_r491368480



##
File path: cpp/src/arrow/util/converter.h
##
@@ -0,0 +1,353 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/builder.h"
+#include "arrow/chunked_array.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace internal {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+template <typename T, typename BaseConverter>
+class PrimitiveConverter : public BaseConverter {
+ public:
+  using BuilderType = typename TypeTraits<T>::BuilderType;
+
+  Status Init() override {
+    primitive_type_ = checked_cast<const T*>(this->type_.get());
+    primitive_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const T* primitive_type_;
+  BuilderType* primitive_builder_;
+};
+
+template <typename T, typename BaseConverter>
+class ListConverter : public BaseConverter {
+ public:
+  using BuilderType = typename TypeTraits<T>::BuilderType;
+
+  Status Init() override {
+    list_type_ = checked_cast<const T*>(this->type_.get());
+    list_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    value_converter_ = this->children_[0];
+    return Status::OK();
+  }
+
+ protected:
+  const T* list_type_;
+  BuilderType* list_builder_;
+  std::shared_ptr<BaseConverter> value_converter_;
+};
+
+template <typename BaseConverter>
+class StructConverter : public BaseConverter {
+ public:
+  Status Init() override {
+    struct_type_ = checked_cast<const StructType*>(this->type_.get());
+    struct_builder_ = checked_cast<StructBuilder*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const StructType* struct_type_;
+  StructBuilder* struct_builder_;
+};
+
+template <typename U, typename BaseConverter>
+class DictionaryConverter : public BaseConverter {
+ public:
+  using BuilderType = DictionaryBuilder<U>;
+
+  Status Init() override {
+    dict_type_ = checked_cast<const DictionaryType*>(this->type_.get());
+    value_type_ = checked_cast<const U*>(dict_type_->value_type().get());
+    value_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    return Status::OK();
+  }
+
+ protected:
+  const DictionaryType* dict_type_;
+  const U* value_type_;
+  BuilderType* value_builder_;
+};
+
+template <typename Converter>
+struct MakeConverterImpl;
+
+template <typename Input, typename Options>
+class Converter {
+ public:
+  using InputType = Input;
+  using OptionsType = Options;
+
+  template <typename T>
+  using Primitive = PrimitiveConverter<T, Converter>;
+  template <typename T>
+  using List = ListConverter<T, Converter>;
+  template <typename U>
+  using Dictionary = DictionaryConverter<U, Converter>;
+  using Struct = StructConverter<Converter>;
+
+  static Result<std::shared_ptr<Converter>> Make(std::shared_ptr<DataType> type,
+                                                 MemoryPool* pool, OptionsType options) {
+    std::shared_ptr<Converter> out;
+    MakeConverterImpl<Converter> visitor = {type, pool, options, &out};
+    ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor));
+    ARROW_RETURN_NOT_OK(out->Init());
+    return out;
+  }
+
+  virtual ~Converter() = default;
+
+  virtual Status Init() { return Status::OK(); }
+
+  virtual Status Append(InputType value) {
+    return Status::NotImplemented("Converter not implemented for type ",
+                                  type()->ToString());
+  }
+
+  const std::shared_ptr<ArrayBuilder>& builder() const { return builder_; }
+
+  const std::shared_ptr<DataType>& type() const { return type_; }
+
+  OptionsType options() const { return options_; }
+
+  const std::vector<std::shared_ptr<Converter>> children() const { return children_; }
+
+  virtual Status Reserve(int64_t additional_capacity) {
+    return builder_->Reserve(additional_capacity);
+  }
+
+  virtual Status AppendNull() { return builder_->AppendNull(); }
+
+  virtual Result<std::shared_ptr<Array>> ToArray() { return builder_->Finish(); }

Review comment:
   It is overridden by one of the python converters to change the type of the output array based on the observed values.
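
A hedged illustration of that pattern, with hypothetical names rather than the PR's actual class: the override finishes the wrapped builder as usual, then re-views the result under a different type when a non-utf8 value was observed during conversion.

template <typename BaseConverter>
class BinaryOrUtf8Converter : public BaseConverter {
 public:
  Result<std::shared_ptr<Array>> ToArray() override {
    ARROW_ASSIGN_OR_RAISE(auto array, this->builder()->Finish());
    if (observed_binary_) {
      // Array::View reuses the utf8 buffers zero-copy under the binary type.
      return array->View(binary());
    }
    return array;
  }

 protected:
  bool observed_binary_ = false;  // would be set by Append() on seeing a bytes object
};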









[GitHub] [arrow] kszucs commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API

2020-09-19 Thread GitBox


kszucs commented on a change in pull request #8088:
URL: https://github.com/apache/arrow/pull/8088#discussion_r491309216



##
File path: cpp/src/arrow/python/python_to_arrow.cc
##
@@ -1352,64 +927,40 @@ Status ConvertToSequenceAndInferSize(PyObject* obj, PyObject** seq, int64_t* siz
   return Status::OK();
 }
 
-Status ConvertPySequence(PyObject* sequence_source, PyObject* mask,
-                         const PyConversionOptions& options,
-                         std::shared_ptr<ChunkedArray>* out) {
+Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject* mask,
+                                                        const PyConversionOptions& opts,
+                                                        MemoryPool* pool) {
   PyAcquireGIL lock;
 
   PyObject* seq;
   OwnedRef tmp_seq_nanny;
-
-  std::shared_ptr<DataType> real_type;
+  PyConversionOptions options = opts;  // copy options struct since we modify it below

Review comment:
   Right, updated.
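
The idiom being discussed, as a standalone sketch; only PyConversionOptions is from the PR, the helper name and the assumption that its int64_t size field defaults to -1 are illustrative:

Status ConvertImpl(PyObject* obj, const PyConversionOptions& opts) {
  PyConversionOptions options = opts;  // local copy; the caller's struct stays untouched
  if (options.size < 0) {
    // Fill in a defaulted field on the copy only (error handling elided).
    options.size = static_cast<int64_t>(PySequence_Size(obj));
  }
  // ... conversion proceeds against the local copy ...
  return Status::OK();
}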









[GitHub] [arrow] kszucs commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API

2020-09-19 Thread GitBox


kszucs commented on a change in pull request #8088:
URL: https://github.com/apache/arrow/pull/8088#discussion_r491304867



##
File path: cpp/src/arrow/python/python_to_arrow.cc
##
@@ -329,186 +300,106 @@ struct ValueConverter {
       default:
         return Status::UnknownError("Invalid time unit");
     }
+    } else if (PyArray_CheckAnyScalarExact(obj)) {
+      // validate that the numpy scalar has np.timedelta64 dtype
+      std::shared_ptr<DataType> numpy_type;
+      RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type));
+      if (!numpy_type->Equals(*type)) {
+        return Status::NotImplemented("Expected np.timedelta64 but got: ",
+                                      numpy_type->ToString());
+      }
+      return reinterpret_cast<PyTimedeltaScalarObject*>(obj)->obval;
+    } else {
+      RETURN_NOT_OK(internal::CIntFromPython(obj, &value));
+    }
+    return value;
   }
 
-  static inline Result<int64_t> FromNumpy(PyObject* obj, TimeUnit::type unit) {
-    // validate that the numpy scalar has np.timedelta64 dtype
-    std::shared_ptr<DataType> type;
-    RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &type));
-    if (type->id() != DurationType::type_id) {
-      // TODO(kszucs): the message should highlight the received numpy dtype
-      return Status::Invalid("Expected np.timedelta64 but got: ", type->ToString());
-    }
-    // validate that the time units are matching
-    if (unit != checked_cast<const DurationType&>(*type).unit()) {
-      return Status::NotImplemented(
-          "Cannot convert NumPy np.timedelta64 objects with differing unit");
-    }
-    // convert the numpy value
-    return reinterpret_cast<PyTimedeltaScalarObject*>(obj)->obval;
-  }
-};
-
-template <typename Type>
-struct ValueConverter<Type, enable_if_binary<Type>> {
-  static inline Result<PyBytesView> FromPython(PyObject* obj) {
-    PyBytesView view;
-    RETURN_NOT_OK(view.FromString(obj));
-    return std::move(view);
-  }
-};
+  // The binary-like intermediate representation is PyBytesView because it keeps
+  // temporary python objects alive (non-contiguous memoryview) and stores whether
+  // the original object was unicode encoded or not, which is used for unicode ->
+  // bytes coercion if there is a non-unicode object observed.
 
-template <typename Type>
-struct ValueConverter<Type, enable_if_string<Type>> {
-  static inline Result<PyBytesView> FromPython(PyObject* obj) {
-    // strict conversion, force output to be unicode / utf8 and validate that
-    // any binary values are utf8
-    bool is_utf8 = false;
-    PyBytesView view;
-
-    RETURN_NOT_OK(view.FromString(obj, &is_utf8));
-    if (!is_utf8) {
-      return internal::InvalidValue(obj, "was not a utf8 string");
-    }
-    return std::move(view);
+  static Result<PyBytesView> Convert(const BaseBinaryType*, const O&, I obj) {
+    return PyBytesView::FromString(obj);
   }
 
-  static inline Result<PyBytesView> FromPython(PyObject* obj, bool* is_utf8) {
-    PyBytesView view;
-
-    // Non-strict conversion; keep track of whether values are unicode or bytes
-    if (PyUnicode_Check(obj)) {
-      *is_utf8 = true;
-      RETURN_NOT_OK(view.FromUnicode(obj));
+  static Result<PyBytesView> Convert(const FixedSizeBinaryType* type, const O&, I obj) {
+    ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj));
+    if (ARROW_PREDICT_TRUE(view.size == type->byte_width())) {
+      return std::move(view);
     } else {
-      // If not unicode or bytes, FromBinary will error
-      *is_utf8 = false;
-      RETURN_NOT_OK(view.FromBinary(obj));
-    }
-    return std::move(view);
-  }
-};
-
-template <typename Type>
-struct ValueConverter<Type, enable_if_fixed_size_binary<Type>> {
-  static inline Result<PyBytesView> FromPython(PyObject* obj, int32_t byte_width) {
-    PyBytesView view;
-    RETURN_NOT_OK(view.FromString(obj));
-    if (ARROW_PREDICT_FALSE(view.size != byte_width)) {
       std::stringstream ss;
-      ss << "expected to be length " << byte_width << " was " << view.size;
+      ss << "expected to be length " << type->byte_width() << " was " << view.size;
       return internal::InvalidValue(obj, ss.str());
-    } else {
-      return std::move(view);
     }
   }
-};
-
-// ----------------------------------------------------------------------
-// Sequence converter base and CRTP "middle" subclasses
 
-class SeqConverter;
-
-// Forward-declare converter factory
-Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas,
-                    bool strict_conversions, bool ignore_timezone,
-                    std::unique_ptr<SeqConverter>* out);
-
-// Marshal Python sequence (list, tuple, etc.) to Arrow array
-class SeqConverter {
- public:
-  virtual ~SeqConverter() = default;
-
-  // Initialize the sequence converter with an ArrayBuilder created
-  // externally. The reason for this interface is that we have
-  // arrow::MakeBuilder which also creates child builders for nested types, so
-  // we have to pass in the child builders to child SeqConverter in the case of
-  // converting Python objects to Arrow nested types
-  virtual Status Init(ArrayBuilder* builder) = 0;
-
-  // Append a single null value to the builder
-  virtual Status AppendNull() = 0;
-
-  // Append a vali