[GitHub] [arrow] bkietz commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API
bkietz commented on a change in pull request #8088: URL: https://github.com/apache/arrow/pull/8088#discussion_r491639719 ## File path: python/pyarrow/array.pxi ## @@ -158,24 +158,44 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, Notes - Localized timestamps will currently be returned as UTC (pandas's native -representation). Timezone-naive data will be implicitly interpreted as +representation). Timezone-naive data will be implicitly interpreted as UTC. +Converting to dictionary array will promote to a wider integer type for +indices if the number of distinct values cannot be represented, even if +the index type was explicitly set. This means that if there are more than +127 values the returned dictionary array's index type will be at least +pa.int16() even if pa.int8() was passed to the function. Note that an +explicit index type will not be demoted even if it is wider than required. + Examples >>> import pandas as pd >>> import pyarrow as pa >>> pa.array(pd.Series([1, 2])) - + [ 1, 2 ] +>>> pa.array(["a", "b", "a"], type=pa.dictionary(pa.int8(), pa.string())) + +-- dictionary: +[ + "a", + "b" +] +-- indices: +[ + 0, + 1, + 0 +] + Review comment: Would it be worthwhile to add an example of index type promotion? ```suggestion >>> pa.array(range(1024), type=pa.dictionary(pa.int8(), pa.int64()).type.index_type DataType(int16) ``` ## File path: cpp/src/arrow/util/converter.h ## @@ -0,0 +1,348 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" + +#include "arrow/visitor_inline.h" + +namespace arrow { +namespace internal { + +template +class Converter { + public: + using InputType = Input; + using OptionsType = Options; + + virtual ~Converter() = default; + + virtual Status Initialize(std::shared_ptr type, +std::shared_ptr builder, +const std::vector>& children, +OptionsType options) { +type_ = std::move(type); +builder_ = std::move(builder); +children_ = std::move(children); +options_ = std::move(options); +return Init(); + } + + virtual Status Init() { return Status::OK(); } + + virtual Status Append(InputType value) { +return Status::NotImplemented("Converter not implemented for type ", + type()->ToString()); + } + + const std::shared_ptr& builder() const { return builder_; } + + const std::shared_ptr& type() const { return type_; } + + OptionsType options() const { return options_; } + + const std::vector>& children() const { return children_; } + + Status Reserve(int64_t additional_capacity) { +return builder_->Reserve(additional_capacity); + } + + Status AppendNull() { return builder_->AppendNull(); } + + virtual Result> ToArray() { return builder_->Finish(); } + + virtual Result> ToArray(int64_t length) { +ARROW_ASSIGN_OR_RAISE(auto arr, this->ToArray()); +return arr->Slice(0, length); + } + + protected: + std::shared_ptr type_; + std::shared_ptr builder_; + std::vector> children_; + OptionsType options_; +}; + +template +class PrimitiveConverter : public BaseConverter { + public: + using BuilderType = typename TypeTraits::BuilderType; + + Status Init() override { +primitive_type_ = checked_cast(this->type_.get()); +primitive_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const T* primitive_type_; + BuilderType* primitive_builder_; +}; + +template +class ListConverter : public BaseConverter { Review comment: ```suggestion template class ListConverter : public BaseConverter { ``` ## File path: cpp/src/arrow/util/converter.h ## @@ -0,0 +1,348 @@ +// Licensed to the
[GitHub] [arrow] emkornfield commented on pull request #8219: ARROW-9603: Fix parquet write
emkornfield commented on pull request #8219: URL: https://github.com/apache/arrow/pull/8219#issuecomment-695392925 Nm, I think this is likely the only reasonable approach. We might consider pushing bitmap building up the stack at some point. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #8224: ARROW-10044: [Rust] Improved Arrow's README.
github-actions[bot] commented on pull request #8224: URL: https://github.com/apache/arrow/pull/8224#issuecomment-695361993 https://issues.apache.org/jira/browse/ARROW-10044 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on pull request #8219: ARROW-9603: Fix parquet write
emkornfield commented on pull request #8219: URL: https://github.com/apache/arrow/pull/8219#issuecomment-695361437 There is a better solution. I'll update the PR This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] jorgecarleitao opened a new pull request #8224: ARROW-10044: [Rust] Improved Arrow's README.
jorgecarleitao opened a new pull request #8224: URL: https://github.com/apache/arrow/pull/8224 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] jorgecarleitao commented on pull request #8172: ARROW-9937: [Rust] [DataFusion] Improved aggregations
jorgecarleitao commented on pull request #8172: URL: https://github.com/apache/arrow/pull/8172#issuecomment-695358864 @alamb , I tried to take the ScalarValue out of the PR for over 2hs, but I was unable: there is too much dependency on it on the current accumulators, and it would be a lot of new code that would then be again removed by this PR. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nevi-me commented on pull request #8221: ARROW-9338: [Rust] Add clippy instructions
nevi-me commented on pull request #8221: URL: https://github.com/apache/arrow/pull/8221#issuecomment-695351599 I'm unable to merge, something wrong with my local setup. @andygrove may you please merge if you see this message This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] trxcllnt commented on pull request #2035: ARROW-2116: [JS] implement IPC writers
trxcllnt commented on pull request #2035: URL: https://github.com/apache/arrow/pull/2035#issuecomment-695349165 > @t829702 if JS is not the way to interact with Arrow, then what is the purpose of JS implementation? Is the JS implementation supposed to be read-only uses? My comment was about the Arrow JSON IPC _representation_. The Arrow JS library is a valid way to interact with Arrow data and stream. We have a special JSON IPC reprepsentation to run integration tests between the different Arrow language implementations. The `RecordBatchJSONReader` and `RecordBatchJSONWriter` classes are used to read/write this format, but this format is designed for reading and writing by humans, and would be very inefficient for real-world use. > can different columns share a single Dictionary (to save some space) This isn't prohibited in the Arrow IPC format, but this is not supported by the current `DictionaryBuilder` implementation. That would be a good feature to add in [JIRA](https://issues.apache.org/jira/projects/ARROW). > is there a better arrow data type for JavaScript Date's JSON string format? No, that's the correct API for the Date data types. Providing a separate utility in Arrow to parse dates would simply be duplicating the (relatively well optimized) Date implementations provided by the JS VM. If you want to parse the string directly into a numeric timestamp without using Date, you could use one of the Arrow `Timestamp` data types instead of a Date type. > Is there a better way to create RecordBatch than the static method `arrow.RecordBatch.new`? No, this is the recommended way to construct a RecordBatch zero-copy. > it runs in 2.3s convert a 50MB line-json file to a 21MB arrow file, not too bad for each single file, most of my dataset source files are less than 100MB, but there are huge number of them, what's the better (optimized and ergonomic way) to convert them? You may have success using a newline-delimited JSON parser like [ndjson](https://www.npmjs.com/package/ndjson) rather than JSON.parse. You can also pass a custom `dictionaryHashFunction` to the DictionaryBuilder; I opted for the ultra-fast [node metrohash](https://www.npmjs.com/package/metrohash) implementation in the [csv-to-arrow example](https://github.com/trxcllnt/csv-to-arrow-js/blob/f2596045474ce1742e3089da48a5c83a6005be90/index.js#L61). The easiest approach to speeding this up is to run your script in parallel on as many cores as you can afford, or use a different language implementation. If you have a GPU available, you can use [cuDF](https://docs.rapids.ai/api/cudf/stable/) to read newline-delimited JSON as a cuDF DataFrame, then serialize the DataFrame to disk as an Arrow table. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #8223: ARROW-10040: [Rust] Add slice that realigns Buffer
github-actions[bot] commented on pull request #8223: URL: https://github.com/apache/arrow/pull/8223#issuecomment-695347870 https://issues.apache.org/jira/browse/ARROW-10040 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nevi-me commented on pull request #8223: ARROW-10040: [Rust] Add slice that realigns Buffer
nevi-me commented on pull request #8223: URL: https://github.com/apache/arrow/pull/8223#issuecomment-695347849 @jhorstmann @paddyhoran this is related to the alignment fixes made recently. I noticed while reviewing another PR that we had a limitation on boolean kernels if offsets weren't a multiple of 8. So I've implemented a slice on `Buffer` that copies the buffer and removes the offsets. I expect there to be a minor perf impact, and I only use the above slice method when necessary. It's very likely that my implementation can be improved upon, but I'm at my current limits on bit manipulation. Please see https://godbolt.org/z/bMc5qd for some of the assembly generated. Not happy with the instruction count, and would appreciate some guidance or improvements. Feel free to push any changes directly on this branch. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nevi-me opened a new pull request #8223: ARROW-10040: [Rust] Add slice that realigns Buffer
nevi-me opened a new pull request #8223: URL: https://github.com/apache/arrow/pull/8223 Has the consequence of removing the alignment limit on bool kernels. It however comes at the cost of slower buffer manipulation. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #8222: ARROW-10043: [Rust][DataFusion] Implement COUNT(DISTINCT col)
github-actions[bot] commented on pull request #8222: URL: https://github.com/apache/arrow/pull/8222#issuecomment-695345313 https://issues.apache.org/jira/browse/ARROW-10043 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] drusso opened a new pull request #8222: ARROW-10043: [Rust][DataFusion] Implement COUNT(DISTINCT col)
drusso opened a new pull request #8222: URL: https://github.com/apache/arrow/pull/8222 This is a proposal for an initial and partial implementation of the `DISTINCT` keyword. Only `COUNT(DISTINCT)` is supported, with the following conditions: (a) only one argument, i.e. `COUNT(DISTINCT col)`, but not `COUNT(DISTINCT col, other)`, (b) the argument is an integer type, and (c) the query must have a `GROUP BY` clause. **Implementation Overview:** The `Expr::AggregateFunction` variant has a new field, `distinct`, which mirrors the `distinct` flag from `SQLExpr::Function` (up until now this flag was unused). Any `Expr::AggregateFunction` may have its `distinct` flag switched to `true` if the keyword is present in the SQL query. However, the physical planner respects it only for `COUNT` expressions. The count distinct aggregation slots into the existing physical plans as a new set of `AggregateExpr`. To demonstrate, below are examples of the physical plans for the following query, where `c1` may be any data type, and `c2` is a `UInt8` column: ``` SELECT c1, COUNT(DISTINCT c2) FROM t1 GROUP BY c1 ``` (a) Multiple Partitions: HashAggregateExec: mode: Final group_expr: Column(c1) aggr_expr: DistinctCountReduce(Column(c2)) schema: c1: any c2: UInt64 input: MergeExec: input: HashAggregateExec: mode: Partial group_expr: Column(c1) aggr_expr: DistinctCount(Column(c2)) schema: c1: any c2: LargeList(UInt8) input: CsvExec: schema: c1: any c2: UInt8 The `DistinctCount` accumulates each `UInt8` into a list of distinct `UInt8`. No counts are collected yet, this is a partial result: lists of distinct values. In the `RecordBatch`, this is a `LargeListArray` column. After the `MergeExec`, each list in `LargeListArray` is accumulated by `DistinctCountReduce` (via `accumulate_batch()`), producing the _final_ sets of distinct values. Finally, given the finalized sets of distinct values, the counts are computed (always as `UInt64`). (b) Single Partition: HashAggregateExec: mode: NoPartial group_expr: Column(c1) aggr_expr: DistinctCountReduce(Column(c2)) schema: c1: any c2: UInt64 input: CsvExec: schema: c1: any c2: UInt8 This scenario is unlike the multiple partition scenario: `DistinctCount` is _not_ used, and there are no partial sets of distinct values. Rather, in a single `HashAggregateExec` stage, each `UInt8` is accumulated into a distinct value set, then the counts are computed at the end of the stage. `DistinctCountReduce` is used, but note that unlike the multiple partition case, it accumulates scalars via `accumulate_scalar()`. There is a new aggregation mode: `NoPartial`. In summary, the modes are: - `NoPartial`: used in single-stage aggregations - `Partial`: used as the first stage of two-stage aggregations - `Final`: used as the second stage of two-stage aggregaions Prior to the new `NoPartial` mode, `Partial` was handling both of what are now the responsibilities of `Partial` and `NoPartial`. No distinction was required, because _non-distinct_ aggregations (such as count, sum, min, max, and avg) do not need the distinction: the first aggregation stage is always the same, regardless of whether the aggregation is one-stage or two-stage. This is not the case for a _distinct_ count aggregation, and we can see that in the physical plans above. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wjones1 commented on pull request #6979: ARROW-7800 [Python] implement iter_batches() method for ParquetFile and ParquetReader
wjones1 commented on pull request #6979: URL: https://github.com/apache/arrow/pull/6979#issuecomment-695343480 I'm back on this for the weekend and will be back as needed the week after next. @jorisvandenbossche I can confirm that once I merge in the latest changes from apache master, I am getting the batches to be the expected size (and spanning across the parquet chunks). I have updated the `test_iter_batches_columns_reader` unit test accordingly. However, I have found that if the columns selected include a categorical column, then it reverts back to the behavior I was seeing before. You should be able to reproduce this by editing line 178 of the parquet tests to include `categorical=True`, and will find that the `test test_iter_batches_columns_reader` will then fail. I'm fine leaving this fix for a future story. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wjones1 commented on a change in pull request #6979: ARROW-7800 [Python] implement iter_batches() method for ParquetFile and ParquetReader
wjones1 commented on a change in pull request #6979: URL: https://github.com/apache/arrow/pull/6979#discussion_r491481801 ## File path: python/pyarrow/parquet.py ## @@ -301,6 +301,45 @@ def read_row_groups(self, row_groups, columns=None, use_threads=True, column_indices=column_indices, use_threads=use_threads) +def iter_batches(self, batch_size=65536, row_groups=None, columns=None, + use_threads=True, use_pandas_metadata=False): +""" +Read streaming batches from a Parquet file + +Parameters +-- +batch_size: int, default 64K +Maximum number of records to yield per batch. Batches may be +smaller if there aren't enough rows in a rowgroup. +row_groups: list +Only these row groups will be read from the file. +columns: list +If not None, only these columns will be read from the file. A +column name may be a prefix of a nested field, e.g. 'a' will select +'a.b', 'a.c', and 'a.d.e' +use_threads : boolean, default True +Perform multi-threaded column reads +use_pandas_metadata : boolean, default False +If True and file has custom pandas schema metadata, ensure that +index columns are also loaded + +Yields Review comment: Got it. Also realized that technically it's yielding RecordBatches, not Tables. So changing to `iterator of pyarrow.RecordBatch`. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] arw2019 commented on a change in pull request #6979: ARROW-7800 [Python] implement iter_batches() method for ParquetFile and ParquetReader
arw2019 commented on a change in pull request #6979: URL: https://github.com/apache/arrow/pull/6979#discussion_r491479021 ## File path: python/pyarrow/parquet.py ## @@ -301,6 +301,45 @@ def read_row_groups(self, row_groups, columns=None, use_threads=True, column_indices=column_indices, use_threads=use_threads) +def iter_batches(self, batch_size=65536, row_groups=None, columns=None, + use_threads=True, use_pandas_metadata=False): +""" +Read streaming batches from a Parquet file + +Parameters +-- +batch_size: int, default 64K +Maximum number of records to yield per batch. Batches may be +smaller if there aren't enough rows in a rowgroup. +row_groups: list +Only these row groups will be read from the file. +columns: list +If not None, only these columns will be read from the file. A +column name may be a prefix of a nested field, e.g. 'a' will select +'a.b', 'a.c', and 'a.d.e' +use_threads : boolean, default True +Perform multi-threaded column reads +use_pandas_metadata : boolean, default False +If True and file has custom pandas schema metadata, ensure that +index columns are also loaded + +Yields Review comment: Should be `Returns` ## File path: python/pyarrow/parquet.py ## @@ -301,6 +301,45 @@ def read_row_groups(self, row_groups, columns=None, use_threads=True, column_indices=column_indices, use_threads=use_threads) +def iter_batches(self, batch_size=65536, row_groups=None, columns=None, + use_threads=True, use_pandas_metadata=False): +""" +Read streaming batches from a Parquet file + +Parameters +-- +batch_size: int, default 64K +Maximum number of records to yield per batch. Batches may be +smaller if there aren't enough rows in a rowgroup. +row_groups: list +Only these row groups will be read from the file. +columns: list +If not None, only these columns will be read from the file. A +column name may be a prefix of a nested field, e.g. 'a' will select +'a.b', 'a.c', and 'a.d.e' +use_threads : boolean, default True +Perform multi-threaded column reads +use_pandas_metadata : boolean, default False +If True and file has custom pandas schema metadata, ensure that +index columns are also loaded + +Yields +-- +pyarrow.table.Table +Content of each batch as a table (of columns) Review comment: `Contents` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] jorgecarleitao commented on pull request #8118: ARROW-9922: [Rust] Add StructArray::TryFrom (+40%)
jorgecarleitao commented on pull request #8118: URL: https://github.com/apache/arrow/pull/8118#issuecomment-695335519 @nevi-me and @andygrove , I reverted the change wrt to the builder, so that this is an additive PR. @andygrove, wrt to the dynamically building the array, note that a StructArray is almost only composed by child data: the struct itself is a null bitmap and some pointers. Therefore, the cost of building a Struct will always be driven by the allocation of those buffers. With that said, you are right that during the creation of the fields, the benchmark clones the arrays, while a builder will build them on the fly and thus reduce memory footprint. IMO that issue is separated from the creation of the struct itself (but related to the build of its childs): it is how we efficiently build non-struct arrays without first allocating vectors, that the builders aimed at solving. I am outlying some of this on #8211, which allows to build primitive Arrays from an iterator without exposing a unsafe API to users and would avoid the double allocation that you refer to. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] bkietz commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API
bkietz commented on a change in pull request #8088: URL: https://github.com/apache/arrow/pull/8088#discussion_r491397449 ## File path: cpp/src/arrow/util/converter.h ## @@ -0,0 +1,353 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" + +#include "arrow/visitor_inline.h" + +namespace arrow { +namespace internal { + +using internal::checked_cast; +using internal::checked_pointer_cast; + +template +class PrimitiveConverter : public BaseConverter { + public: + using BuilderType = typename TypeTraits::BuilderType; + + Status Init() override { +primitive_type_ = checked_cast(this->type_.get()); +primitive_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const T* primitive_type_; + BuilderType* primitive_builder_; +}; + +template +class ListConverter : public BaseConverter { + public: + using BuilderType = typename TypeTraits::BuilderType; + + Status Init() override { +list_type_ = checked_cast(this->type_.get()); +list_builder_ = checked_cast(this->builder_.get()); +value_converter_ = this->children_[0]; +return Status::OK(); + } + + protected: + const T* list_type_; + BuilderType* list_builder_; + std::shared_ptr value_converter_; +}; + +template +class StructConverter : public BaseConverter { + public: + Status Init() override { +struct_type_ = checked_cast(this->type_.get()); +struct_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const StructType* struct_type_; + StructBuilder* struct_builder_; +}; + +template +class DictionaryConverter : public BaseConverter { + public: + using BuilderType = DictionaryBuilder; + + Status Init() override { +dict_type_ = checked_cast(this->type_.get()); +value_type_ = checked_cast(dict_type_->value_type().get()); +value_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const DictionaryType* dict_type_; + const U* value_type_; + BuilderType* value_builder_; +}; + +template +struct MakeConverterImpl; + +template +class Converter { + public: + using InputType = Input; + using OptionsType = Options; + + template + using Primitive = PrimitiveConverter; + template + using List = ListConverter; + template + using Dictionary = DictionaryConverter; + using Struct = StructConverter; + + static Result> Make(std::shared_ptr type, +MemoryPool* pool, OptionsType options) { +std::shared_ptr out; +MakeConverterImpl visitor = {type, pool, options, &out}; +ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor)); +ARROW_RETURN_NOT_OK(out->Init()); +return out; + } + + virtual ~Converter() = default; + + virtual Status Init() { return Status::OK(); } + + virtual Status Append(InputType value) { +return Status::NotImplemented("Converter not implemented for type ", + type()->ToString()); + } Review comment: Implementations can still choose to return notimplemented for some types, so I think the choice to do so should be left to them This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API
kszucs commented on a change in pull request #8088: URL: https://github.com/apache/arrow/pull/8088#discussion_r491386279 ## File path: cpp/src/arrow/util/converter.h ## @@ -0,0 +1,353 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" + +#include "arrow/visitor_inline.h" + +namespace arrow { +namespace internal { + +using internal::checked_cast; +using internal::checked_pointer_cast; + +template +class PrimitiveConverter : public BaseConverter { + public: + using BuilderType = typename TypeTraits::BuilderType; + + Status Init() override { +primitive_type_ = checked_cast(this->type_.get()); +primitive_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const T* primitive_type_; + BuilderType* primitive_builder_; +}; + +template +class ListConverter : public BaseConverter { + public: + using BuilderType = typename TypeTraits::BuilderType; + + Status Init() override { +list_type_ = checked_cast(this->type_.get()); +list_builder_ = checked_cast(this->builder_.get()); +value_converter_ = this->children_[0]; +return Status::OK(); + } + + protected: + const T* list_type_; + BuilderType* list_builder_; + std::shared_ptr value_converter_; +}; + +template +class StructConverter : public BaseConverter { + public: + Status Init() override { +struct_type_ = checked_cast(this->type_.get()); +struct_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const StructType* struct_type_; + StructBuilder* struct_builder_; +}; + +template +class DictionaryConverter : public BaseConverter { + public: + using BuilderType = DictionaryBuilder; + + Status Init() override { +dict_type_ = checked_cast(this->type_.get()); +value_type_ = checked_cast(dict_type_->value_type().get()); +value_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const DictionaryType* dict_type_; + const U* value_type_; + BuilderType* value_builder_; +}; + +template +struct MakeConverterImpl; + +template +class Converter { + public: + using InputType = Input; + using OptionsType = Options; + + template + using Primitive = PrimitiveConverter; + template + using List = ListConverter; + template + using Dictionary = DictionaryConverter; + using Struct = StructConverter; + + static Result> Make(std::shared_ptr type, +MemoryPool* pool, OptionsType options) { +std::shared_ptr out; +MakeConverterImpl visitor = {type, pool, options, &out}; +ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor)); +ARROW_RETURN_NOT_OK(out->Init()); +return out; + } + + virtual ~Converter() = default; + + virtual Status Init() { return Status::OK(); } + + virtual Status Append(InputType value) { +return Status::NotImplemented("Converter not implemented for type ", + type()->ToString()); + } + + const std::shared_ptr& builder() const { return builder_; } + + const std::shared_ptr& type() const { return type_; } + + OptionsType options() const { return options_; } + + const std::vector> children() const { return children_; } + + virtual Status Reserve(int64_t additional_capacity) { +return builder_->Reserve(additional_capacity); + } + + virtual Status AppendNull() { return builder_->AppendNull(); } + + virtual Result> ToArray() { return builder_->Finish(); } + + virtual Result> ToArray(int64_t length) { +ARROW_ASSIGN_OR_RAISE(auto arr, this->ToArray()); +return arr->Slice(0, length); + } + + protected: + friend struct MakeConverterImpl; + + std::shared_ptr type_; + std::shared_ptr builder_; + std::vector> children_; + OptionsType options_; +}; + +#define DICTIONARY_CASE(TYPE_ENUM, TYPE_CLASS) \ + case Type::TYPE_ENUM: \ +return Finish>( \ +std::move(
[GitHub] [arrow] kszucs commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API
kszucs commented on a change in pull request #8088: URL: https://github.com/apache/arrow/pull/8088#discussion_r491386599 ## File path: cpp/src/arrow/python/python_to_arrow.cc ## @@ -329,186 +300,106 @@ struct ValueConverter { default: return Status::UnknownError("Invalid time unit"); } +} else if (PyArray_CheckAnyScalarExact(obj)) { + // validate that the numpy scalar has np.datetime64 dtype + std::shared_ptr numpy_type; + RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type)); + if (!numpy_type->Equals(*type)) { +return Status::NotImplemented("Expected np.timedelta64 but got: ", + numpy_type->ToString()); + } + return reinterpret_cast(obj)->obval; } else { RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); } return value; } - static inline Result FromNumpy(PyObject* obj, TimeUnit::type unit) { -// validate that the numpy scalar has np.timedelta64 dtype -std::shared_ptr type; -RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &type)); -if (type->id() != DurationType::type_id) { - // TODO(kszucs): the message should highlight the received numpy dtype - return Status::Invalid("Expected np.timedelta64 but got: ", type->ToString()); -} -// validate that the time units are matching -if (unit != checked_cast(*type).unit()) { - return Status::NotImplemented( - "Cannot convert NumPy np.timedelta64 objects with differing unit"); -} -// convert the numpy value -return reinterpret_cast(obj)->obval; - } -}; - -template -struct ValueConverter> { - static inline Result FromPython(PyObject* obj) { -PyBytesView view; -RETURN_NOT_OK(view.FromString(obj)); -return std::move(view); - } -}; + // The binary-like intermediate representation is PyBytesView because it keeps temporary + // python objects alive (non-contiguous memoryview) and stores whether the original + // object was unicode encoded or not, which is used for unicode -> bytes coersion if + // there is a non-unicode object observed. -template -struct ValueConverter> { - static inline Result FromPython(PyObject* obj) { -// strict conversion, force output to be unicode / utf8 and validate that -// any binary values are utf8 -bool is_utf8 = false; -PyBytesView view; - -RETURN_NOT_OK(view.FromString(obj, &is_utf8)); -if (!is_utf8) { - return internal::InvalidValue(obj, "was not a utf8 string"); -} -return std::move(view); + static Result Convert(const BaseBinaryType*, const O&, I obj) { +return PyBytesView::FromString(obj); } - static inline Result FromPython(PyObject* obj, bool* is_utf8) { -PyBytesView view; - -// Non-strict conversion; keep track of whether values are unicode or bytes -if (PyUnicode_Check(obj)) { - *is_utf8 = true; - RETURN_NOT_OK(view.FromUnicode(obj)); + static Result Convert(const FixedSizeBinaryType* type, const O&, I obj) { +ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj)); +if (ARROW_PREDICT_TRUE(view.size == type->byte_width())) { + return std::move(view); } else { - // If not unicode or bytes, FromBinary will error - *is_utf8 = false; - RETURN_NOT_OK(view.FromBinary(obj)); -} -return std::move(view); - } -}; - -template -struct ValueConverter> { - static inline Result FromPython(PyObject* obj, int32_t byte_width) { -PyBytesView view; -RETURN_NOT_OK(view.FromString(obj)); -if (ARROW_PREDICT_FALSE(view.size != byte_width)) { std::stringstream ss; - ss << "expected to be length " << byte_width << " was " << view.size; + ss << "expected to be length " << type->byte_width() << " was " << view.size; return internal::InvalidValue(obj, ss.str()); -} else { - return std::move(view); } } -}; - -// -- -// Sequence converter base and CRTP "middle" subclasses -class SeqConverter; - -// Forward-declare converter factory -Status GetConverter(const std::shared_ptr& type, bool from_pandas, -bool strict_conversions, bool ignore_timezone, -std::unique_ptr* out); - -// Marshal Python sequence (list, tuple, etc.) to Arrow array -class SeqConverter { - public: - virtual ~SeqConverter() = default; - - // Initialize the sequence converter with an ArrayBuilder created - // externally. The reason for this interface is that we have - // arrow::MakeBuilder which also creates child builders for nested types, so - // we have to pass in the child builders to child SeqConverter in the case of - // converting Python objects to Arrow nested types - virtual Status Init(ArrayBuilder* builder) = 0; - - // Append a single null value to the builder - virtual Status AppendNull() = 0; - - // Append a vali
[GitHub] [arrow] kszucs commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API
kszucs commented on a change in pull request #8088: URL: https://github.com/apache/arrow/pull/8088#discussion_r491384129 ## File path: python/pyarrow/includes/libarrow.pxd ## @@ -969,11 +969,13 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: vector[shared_ptr[CScalar]] value CResult[shared_ptr[CScalar]] field(CFieldRef ref) const -cdef cppclass CDictionaryScalar" arrow::DictionaryScalar"(CScalar): -cppclass CDictionaryValue "arrow::DictionaryScalar::ValueType": -shared_ptr[CScalar] index -shared_ptr[CArray] dictionary +cdef cppclass CDictionaryValue "arrow::DictionaryScalar::ValueType": +shared_ptr[CScalar] index +shared_ptr[CArray] dictionary +cdef cppclass CDictionaryScalar" arrow::DictionaryScalar"(CScalar): +CDictionaryScalar(CDictionaryValue value, shared_ptr[CDataType], + c_bool is_valid) CDictionaryValue value Review comment: Updated. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API
kszucs commented on a change in pull request #8088: URL: https://github.com/apache/arrow/pull/8088#discussion_r491382018 ## File path: cpp/src/arrow/util/converter.h ## @@ -0,0 +1,353 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" + +#include "arrow/visitor_inline.h" + +namespace arrow { +namespace internal { + +using internal::checked_cast; +using internal::checked_pointer_cast; + +template +class PrimitiveConverter : public BaseConverter { + public: + using BuilderType = typename TypeTraits::BuilderType; + + Status Init() override { +primitive_type_ = checked_cast(this->type_.get()); +primitive_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const T* primitive_type_; + BuilderType* primitive_builder_; +}; + +template +class ListConverter : public BaseConverter { + public: + using BuilderType = typename TypeTraits::BuilderType; + + Status Init() override { +list_type_ = checked_cast(this->type_.get()); +list_builder_ = checked_cast(this->builder_.get()); +value_converter_ = this->children_[0]; +return Status::OK(); + } + + protected: + const T* list_type_; + BuilderType* list_builder_; + std::shared_ptr value_converter_; +}; + +template +class StructConverter : public BaseConverter { + public: + Status Init() override { +struct_type_ = checked_cast(this->type_.get()); +struct_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const StructType* struct_type_; + StructBuilder* struct_builder_; +}; + +template +class DictionaryConverter : public BaseConverter { + public: + using BuilderType = DictionaryBuilder; + + Status Init() override { +dict_type_ = checked_cast(this->type_.get()); +value_type_ = checked_cast(dict_type_->value_type().get()); +value_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const DictionaryType* dict_type_; + const U* value_type_; + BuilderType* value_builder_; +}; + +template +struct MakeConverterImpl; + +template +class Converter { + public: + using InputType = Input; + using OptionsType = Options; + + template + using Primitive = PrimitiveConverter; + template + using List = ListConverter; + template + using Dictionary = DictionaryConverter; + using Struct = StructConverter; + + static Result> Make(std::shared_ptr type, +MemoryPool* pool, OptionsType options) { +std::shared_ptr out; +MakeConverterImpl visitor = {type, pool, options, &out}; +ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor)); +ARROW_RETURN_NOT_OK(out->Init()); +return out; + } + + virtual ~Converter() = default; + + virtual Status Init() { return Status::OK(); } + + virtual Status Append(InputType value) { +return Status::NotImplemented("Converter not implemented for type ", + type()->ToString()); + } Review comment: To not require complete type support from the implementor. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API
kszucs commented on a change in pull request #8088: URL: https://github.com/apache/arrow/pull/8088#discussion_r491377495 ## File path: cpp/src/arrow/util/converter.h ## @@ -0,0 +1,353 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" + +#include "arrow/visitor_inline.h" + +namespace arrow { +namespace internal { + +using internal::checked_cast; +using internal::checked_pointer_cast; + +template +class PrimitiveConverter : public BaseConverter { + public: + using BuilderType = typename TypeTraits::BuilderType; + + Status Init() override { +primitive_type_ = checked_cast(this->type_.get()); +primitive_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const T* primitive_type_; + BuilderType* primitive_builder_; +}; + +template +class ListConverter : public BaseConverter { + public: + using BuilderType = typename TypeTraits::BuilderType; + + Status Init() override { +list_type_ = checked_cast(this->type_.get()); +list_builder_ = checked_cast(this->builder_.get()); +value_converter_ = this->children_[0]; +return Status::OK(); + } + + protected: + const T* list_type_; + BuilderType* list_builder_; + std::shared_ptr value_converter_; +}; + +template +class StructConverter : public BaseConverter { + public: + Status Init() override { +struct_type_ = checked_cast(this->type_.get()); +struct_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const StructType* struct_type_; + StructBuilder* struct_builder_; +}; + +template +class DictionaryConverter : public BaseConverter { + public: + using BuilderType = DictionaryBuilder; + + Status Init() override { +dict_type_ = checked_cast(this->type_.get()); +value_type_ = checked_cast(dict_type_->value_type().get()); +value_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const DictionaryType* dict_type_; + const U* value_type_; + BuilderType* value_builder_; +}; + +template +struct MakeConverterImpl; + +template +class Converter { + public: + using InputType = Input; + using OptionsType = Options; + + template + using Primitive = PrimitiveConverter; + template + using List = ListConverter; + template + using Dictionary = DictionaryConverter; + using Struct = StructConverter; + + static Result> Make(std::shared_ptr type, +MemoryPool* pool, OptionsType options) { +std::shared_ptr out; +MakeConverterImpl visitor = {type, pool, options, &out}; +ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor)); +ARROW_RETURN_NOT_OK(out->Init()); +return out; + } + + virtual ~Converter() = default; + + virtual Status Init() { return Status::OK(); } + + virtual Status Append(InputType value) { +return Status::NotImplemented("Converter not implemented for type ", + type()->ToString()); + } + + const std::shared_ptr& builder() const { return builder_; } + + const std::shared_ptr& type() const { return type_; } + + OptionsType options() const { return options_; } + + const std::vector> children() const { return children_; } + + virtual Status Reserve(int64_t additional_capacity) { +return builder_->Reserve(additional_capacity); + } + + virtual Status AppendNull() { return builder_->AppendNull(); } + + virtual Result> ToArray() { return builder_->Finish(); } + + virtual Result> ToArray(int64_t length) { +ARROW_ASSIGN_OR_RAISE(auto arr, this->ToArray()); +return arr->Slice(0, length); + } + + protected: + friend struct MakeConverterImpl; + + std::shared_ptr type_; + std::shared_ptr builder_; + std::vector> children_; + OptionsType options_; +}; + +#define DICTIONARY_CASE(TYPE_ENUM, TYPE_CLASS) \ + case Type::TYPE_ENUM: \ +return Finish>( \ +std::move(
[GitHub] [arrow] kszucs commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API
kszucs commented on a change in pull request #8088: URL: https://github.com/apache/arrow/pull/8088#discussion_r491377151 ## File path: cpp/src/arrow/util/converter.h ## @@ -0,0 +1,353 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" + +#include "arrow/visitor_inline.h" + +namespace arrow { +namespace internal { + +using internal::checked_cast; +using internal::checked_pointer_cast; + +template +class PrimitiveConverter : public BaseConverter { + public: + using BuilderType = typename TypeTraits::BuilderType; + + Status Init() override { +primitive_type_ = checked_cast(this->type_.get()); +primitive_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const T* primitive_type_; + BuilderType* primitive_builder_; +}; + +template +class ListConverter : public BaseConverter { + public: + using BuilderType = typename TypeTraits::BuilderType; + + Status Init() override { +list_type_ = checked_cast(this->type_.get()); +list_builder_ = checked_cast(this->builder_.get()); +value_converter_ = this->children_[0]; +return Status::OK(); + } + + protected: + const T* list_type_; + BuilderType* list_builder_; + std::shared_ptr value_converter_; +}; + +template +class StructConverter : public BaseConverter { + public: + Status Init() override { +struct_type_ = checked_cast(this->type_.get()); +struct_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const StructType* struct_type_; + StructBuilder* struct_builder_; +}; + +template +class DictionaryConverter : public BaseConverter { + public: + using BuilderType = DictionaryBuilder; + + Status Init() override { +dict_type_ = checked_cast(this->type_.get()); +value_type_ = checked_cast(dict_type_->value_type().get()); +value_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const DictionaryType* dict_type_; + const U* value_type_; + BuilderType* value_builder_; +}; + +template +struct MakeConverterImpl; + +template +class Converter { + public: + using InputType = Input; + using OptionsType = Options; + + template + using Primitive = PrimitiveConverter; + template + using List = ListConverter; + template + using Dictionary = DictionaryConverter; + using Struct = StructConverter; + + static Result> Make(std::shared_ptr type, +MemoryPool* pool, OptionsType options) { +std::shared_ptr out; +MakeConverterImpl visitor = {type, pool, options, &out}; +ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor)); +ARROW_RETURN_NOT_OK(out->Init()); +return out; + } + + virtual ~Converter() = default; + + virtual Status Init() { return Status::OK(); } + + virtual Status Append(InputType value) { +return Status::NotImplemented("Converter not implemented for type ", + type()->ToString()); + } + + const std::shared_ptr& builder() const { return builder_; } + + const std::shared_ptr& type() const { return type_; } + + OptionsType options() const { return options_; } + + const std::vector> children() const { return children_; } + + virtual Status Reserve(int64_t additional_capacity) { +return builder_->Reserve(additional_capacity); + } + + virtual Status AppendNull() { return builder_->AppendNull(); } Review comment: Updated. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API
kszucs commented on a change in pull request #8088: URL: https://github.com/apache/arrow/pull/8088#discussion_r491376858 ## File path: cpp/src/arrow/util/converter.h ## @@ -0,0 +1,353 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" + +#include "arrow/visitor_inline.h" + +namespace arrow { +namespace internal { + +using internal::checked_cast; +using internal::checked_pointer_cast; + +template +class PrimitiveConverter : public BaseConverter { + public: + using BuilderType = typename TypeTraits::BuilderType; + + Status Init() override { +primitive_type_ = checked_cast(this->type_.get()); +primitive_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const T* primitive_type_; + BuilderType* primitive_builder_; +}; + +template +class ListConverter : public BaseConverter { + public: + using BuilderType = typename TypeTraits::BuilderType; + + Status Init() override { +list_type_ = checked_cast(this->type_.get()); +list_builder_ = checked_cast(this->builder_.get()); +value_converter_ = this->children_[0]; +return Status::OK(); + } + + protected: + const T* list_type_; + BuilderType* list_builder_; + std::shared_ptr value_converter_; +}; + +template +class StructConverter : public BaseConverter { + public: + Status Init() override { +struct_type_ = checked_cast(this->type_.get()); +struct_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const StructType* struct_type_; + StructBuilder* struct_builder_; +}; + +template +class DictionaryConverter : public BaseConverter { + public: + using BuilderType = DictionaryBuilder; + + Status Init() override { +dict_type_ = checked_cast(this->type_.get()); +value_type_ = checked_cast(dict_type_->value_type().get()); +value_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const DictionaryType* dict_type_; + const U* value_type_; + BuilderType* value_builder_; +}; + +template +struct MakeConverterImpl; + +template +class Converter { + public: + using InputType = Input; + using OptionsType = Options; + + template + using Primitive = PrimitiveConverter; + template + using List = ListConverter; + template + using Dictionary = DictionaryConverter; + using Struct = StructConverter; + + static Result> Make(std::shared_ptr type, +MemoryPool* pool, OptionsType options) { +std::shared_ptr out; +MakeConverterImpl visitor = {type, pool, options, &out}; +ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor)); +ARROW_RETURN_NOT_OK(out->Init()); +return out; + } + + virtual ~Converter() = default; + + virtual Status Init() { return Status::OK(); } + + virtual Status Append(InputType value) { +return Status::NotImplemented("Converter not implemented for type ", + type()->ToString()); + } + + const std::shared_ptr& builder() const { return builder_; } + + const std::shared_ptr& type() const { return type_; } + + OptionsType options() const { return options_; } + + const std::vector> children() const { return children_; } + + virtual Status Reserve(int64_t additional_capacity) { Review comment: Removed. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API
kszucs commented on a change in pull request #8088: URL: https://github.com/apache/arrow/pull/8088#discussion_r491368480 ## File path: cpp/src/arrow/util/converter.h ## @@ -0,0 +1,353 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" + +#include "arrow/visitor_inline.h" + +namespace arrow { +namespace internal { + +using internal::checked_cast; +using internal::checked_pointer_cast; + +template +class PrimitiveConverter : public BaseConverter { + public: + using BuilderType = typename TypeTraits::BuilderType; + + Status Init() override { +primitive_type_ = checked_cast(this->type_.get()); +primitive_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const T* primitive_type_; + BuilderType* primitive_builder_; +}; + +template +class ListConverter : public BaseConverter { + public: + using BuilderType = typename TypeTraits::BuilderType; + + Status Init() override { +list_type_ = checked_cast(this->type_.get()); +list_builder_ = checked_cast(this->builder_.get()); +value_converter_ = this->children_[0]; +return Status::OK(); + } + + protected: + const T* list_type_; + BuilderType* list_builder_; + std::shared_ptr value_converter_; +}; + +template +class StructConverter : public BaseConverter { + public: + Status Init() override { +struct_type_ = checked_cast(this->type_.get()); +struct_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const StructType* struct_type_; + StructBuilder* struct_builder_; +}; + +template +class DictionaryConverter : public BaseConverter { + public: + using BuilderType = DictionaryBuilder; + + Status Init() override { +dict_type_ = checked_cast(this->type_.get()); +value_type_ = checked_cast(dict_type_->value_type().get()); +value_builder_ = checked_cast(this->builder_.get()); +return Status::OK(); + } + + protected: + const DictionaryType* dict_type_; + const U* value_type_; + BuilderType* value_builder_; +}; + +template +struct MakeConverterImpl; + +template +class Converter { + public: + using InputType = Input; + using OptionsType = Options; + + template + using Primitive = PrimitiveConverter; + template + using List = ListConverter; + template + using Dictionary = DictionaryConverter; + using Struct = StructConverter; + + static Result> Make(std::shared_ptr type, +MemoryPool* pool, OptionsType options) { +std::shared_ptr out; +MakeConverterImpl visitor = {type, pool, options, &out}; +ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor)); +ARROW_RETURN_NOT_OK(out->Init()); +return out; + } + + virtual ~Converter() = default; + + virtual Status Init() { return Status::OK(); } + + virtual Status Append(InputType value) { +return Status::NotImplemented("Converter not implemented for type ", + type()->ToString()); + } + + const std::shared_ptr& builder() const { return builder_; } + + const std::shared_ptr& type() const { return type_; } + + OptionsType options() const { return options_; } + + const std::vector> children() const { return children_; } + + virtual Status Reserve(int64_t additional_capacity) { +return builder_->Reserve(additional_capacity); + } + + virtual Status AppendNull() { return builder_->AppendNull(); } + + virtual Result> ToArray() { return builder_->Finish(); } Review comment: It is overridden from one of the python converters to change the type of the output array based on observed values. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API
kszucs commented on a change in pull request #8088: URL: https://github.com/apache/arrow/pull/8088#discussion_r491309216 ## File path: cpp/src/arrow/python/python_to_arrow.cc ## @@ -1352,64 +927,40 @@ Status ConvertToSequenceAndInferSize(PyObject* obj, PyObject** seq, int64_t* siz return Status::OK(); } -Status ConvertPySequence(PyObject* sequence_source, PyObject* mask, - const PyConversionOptions& options, - std::shared_ptr* out) { +Result> ConvertPySequence(PyObject* obj, PyObject* mask, +const PyConversionOptions& opts, +MemoryPool* pool) { PyAcquireGIL lock; PyObject* seq; OwnedRef tmp_seq_nanny; - - std::shared_ptr real_type; + PyConversionOptions options = opts; // copy options struct since we modify it below Review comment: Right, updated. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on a change in pull request #8088: ARROW-9992: [C++][Python] Refactor python to arrow conversions based on a reusable conversion API
kszucs commented on a change in pull request #8088: URL: https://github.com/apache/arrow/pull/8088#discussion_r491304867 ## File path: cpp/src/arrow/python/python_to_arrow.cc ## @@ -329,186 +300,106 @@ struct ValueConverter { default: return Status::UnknownError("Invalid time unit"); } +} else if (PyArray_CheckAnyScalarExact(obj)) { + // validate that the numpy scalar has np.datetime64 dtype + std::shared_ptr numpy_type; + RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type)); + if (!numpy_type->Equals(*type)) { +return Status::NotImplemented("Expected np.timedelta64 but got: ", + numpy_type->ToString()); + } + return reinterpret_cast(obj)->obval; } else { RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); } return value; } - static inline Result FromNumpy(PyObject* obj, TimeUnit::type unit) { -// validate that the numpy scalar has np.timedelta64 dtype -std::shared_ptr type; -RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &type)); -if (type->id() != DurationType::type_id) { - // TODO(kszucs): the message should highlight the received numpy dtype - return Status::Invalid("Expected np.timedelta64 but got: ", type->ToString()); -} -// validate that the time units are matching -if (unit != checked_cast(*type).unit()) { - return Status::NotImplemented( - "Cannot convert NumPy np.timedelta64 objects with differing unit"); -} -// convert the numpy value -return reinterpret_cast(obj)->obval; - } -}; - -template -struct ValueConverter> { - static inline Result FromPython(PyObject* obj) { -PyBytesView view; -RETURN_NOT_OK(view.FromString(obj)); -return std::move(view); - } -}; + // The binary-like intermediate representation is PyBytesView because it keeps temporary + // python objects alive (non-contiguous memoryview) and stores whether the original + // object was unicode encoded or not, which is used for unicode -> bytes coersion if + // there is a non-unicode object observed. -template -struct ValueConverter> { - static inline Result FromPython(PyObject* obj) { -// strict conversion, force output to be unicode / utf8 and validate that -// any binary values are utf8 -bool is_utf8 = false; -PyBytesView view; - -RETURN_NOT_OK(view.FromString(obj, &is_utf8)); -if (!is_utf8) { - return internal::InvalidValue(obj, "was not a utf8 string"); -} -return std::move(view); + static Result Convert(const BaseBinaryType*, const O&, I obj) { +return PyBytesView::FromString(obj); } - static inline Result FromPython(PyObject* obj, bool* is_utf8) { -PyBytesView view; - -// Non-strict conversion; keep track of whether values are unicode or bytes -if (PyUnicode_Check(obj)) { - *is_utf8 = true; - RETURN_NOT_OK(view.FromUnicode(obj)); + static Result Convert(const FixedSizeBinaryType* type, const O&, I obj) { +ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj)); +if (ARROW_PREDICT_TRUE(view.size == type->byte_width())) { + return std::move(view); } else { - // If not unicode or bytes, FromBinary will error - *is_utf8 = false; - RETURN_NOT_OK(view.FromBinary(obj)); -} -return std::move(view); - } -}; - -template -struct ValueConverter> { - static inline Result FromPython(PyObject* obj, int32_t byte_width) { -PyBytesView view; -RETURN_NOT_OK(view.FromString(obj)); -if (ARROW_PREDICT_FALSE(view.size != byte_width)) { std::stringstream ss; - ss << "expected to be length " << byte_width << " was " << view.size; + ss << "expected to be length " << type->byte_width() << " was " << view.size; return internal::InvalidValue(obj, ss.str()); -} else { - return std::move(view); } } -}; - -// -- -// Sequence converter base and CRTP "middle" subclasses -class SeqConverter; - -// Forward-declare converter factory -Status GetConverter(const std::shared_ptr& type, bool from_pandas, -bool strict_conversions, bool ignore_timezone, -std::unique_ptr* out); - -// Marshal Python sequence (list, tuple, etc.) to Arrow array -class SeqConverter { - public: - virtual ~SeqConverter() = default; - - // Initialize the sequence converter with an ArrayBuilder created - // externally. The reason for this interface is that we have - // arrow::MakeBuilder which also creates child builders for nested types, so - // we have to pass in the child builders to child SeqConverter in the case of - // converting Python objects to Arrow nested types - virtual Status Init(ArrayBuilder* builder) = 0; - - // Append a single null value to the builder - virtual Status AppendNull() = 0; - - // Append a vali