[GitHub] [arrow] emkornfield commented on a change in pull request #6985: ARROW-8413: [C++][Parquet][WIP] Refactor Generating validity bitmap for values column
emkornfield commented on a change in pull request #6985: URL: https://github.com/apache/arrow/pull/6985#discussion_r413533870 ## File path: cpp/src/parquet/column_reader.cc ## @@ -50,6 +51,141 @@ using arrow::internal::checked_cast; namespace parquet { +namespace { + +inline void CheckLevelRange(const int16_t* levels, int64_t num_levels, +const int16_t max_expected_level) { + int16_t min_level = std::numeric_limits::max(); + int16_t max_level = std::numeric_limits::min(); + for (int x = 0; x < num_levels; x++) { +min_level = std::min(levels[x], min_level); +max_level = std::max(levels[x], max_level); + } + if (ARROW_PREDICT_FALSE(num_levels > 0 && (min_level < 0 || max_level > max_level))) { +throw ParquetException("definition level exceeds maximum"); + } +} + +#if !defined(ARROW_HAVE_BMI2) + +inline void DefinitionLevelsToBitmapScalar( +const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, +const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, +uint8_t* valid_bits, int64_t valid_bits_offset) { + // We assume here that valid_bits is large enough to accommodate the + // additional definition levels and the ones that have already been written + ::arrow::internal::BitmapWriter valid_bits_writer(valid_bits, valid_bits_offset, +num_def_levels); + + // TODO(itaiin): As an interim solution we are splitting the code path here + // between repeated+flat column reads, and non-repeated+nested reads. 
+ // Those paths need to be merged in the future + for (int i = 0; i < num_def_levels; ++i) { +if (def_levels[i] == max_definition_level) { + valid_bits_writer.Set(); +} else if (max_repetition_level > 0) { + // repetition+flat case + if (def_levels[i] == (max_definition_level - 1)) { +valid_bits_writer.Clear(); +*null_count += 1; + } else { +continue; + } +} else { + // non-repeated+nested case + if (def_levels[i] < max_definition_level) { +valid_bits_writer.Clear(); +*null_count += 1; + } else { +throw ParquetException("definition level exceeds maximum"); + } +} + +valid_bits_writer.Next(); + } + valid_bits_writer.Finish(); + *values_read = valid_bits_writer.position(); +} +#endif + +template +void DefinitionLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_levels, + const int16_t required_definition_level, + int64_t* values_read, int64_t* null_count, + uint8_t* valid_bits, int64_t valid_bits_offset) { + constexpr int64_t kBitMaskSize = 64; + int64_t set_count = 0; + *values_read = 0; + while (num_def_levels > 0) { +int64_t batch_size = std::min(num_def_levels, kBitMaskSize); +CheckLevelRange(def_levels, batch_size, required_definition_level); +uint64_t defined_bitmap = internal::GreaterThanBitmap(def_levels, batch_size, + required_definition_level - 1); +if (has_repeated_parent) { + // This is currently a specialized code path assuming only (nested) lists + // present through the leaf (i.e. no structs). + // Upper level code only calls this method + // when the leaf-values are nullable (otherwise no spacing is needed), + // Because only nested lists exists it is sufficient to know that the field + // was either null or included it (i.e. >= previous definition level -> > previous + // definition - 1). If there where structs mixed in, we need to know the def_level + // of the repeated parent so we can check for def_level > "def level of repeated + // parent". 
+ uint64_t present_bitmap = internal::GreaterThanBitmap( + def_levels, batch_size, required_definition_level - 2); + *values_read += internal::AppendSelectedBitsToValidityBitmap( + /*new_bits=*/defined_bitmap, + /*selection_bitmap*/ present_bitmap, valid_bits, &valid_bits_offset, + &set_count); +} else { + internal::AppendToValidityBitmap( + /*new_bits=*/defined_bitmap, + /*new_bit_count=*/batch_size, valid_bits, &valid_bits_offset, &set_count); + *values_read += batch_size; +} +def_levels += batch_size; +num_def_levels -= batch_size; + } + *null_count += *values_read - set_count; +} + +inline void DefinitionLevelsToBitmapDispatch( +const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, +const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, +uint8_t* valid_bits, int64_t valid_bits_offset) { + if (max_repetition_level > 0) { +#if ARROW_LITTLE_ENDIAN +// we use AVX2 as a proxy for BMI2 instruction set, since there doesn't seem to be a clean +// way of de
[GitHub] [arrow] emkornfield commented on a change in pull request #6985: ARROW-8413: [C++][Parquet][WIP] Refactor Generating validity bitmap for values column
emkornfield commented on a change in pull request #6985: URL: https://github.com/apache/arrow/pull/6985#discussion_r413530683 ## File path: cpp/src/arrow/util/bit_util.h ## @@ -43,13 +43,18 @@ #if defined(_MSC_VER) #include +#include #pragma intrinsic(_BitScanReverse) #pragma intrinsic(_BitScanForward) #define ARROW_BYTE_SWAP64 _byteswap_uint64 #define ARROW_BYTE_SWAP32 _byteswap_ulong +#define ARROW_POPCOUNT64 __popcnt64 +#define ARROW_POPCOUNT32 __popcnt #else #define ARROW_BYTE_SWAP64 __builtin_bswap64 #define ARROW_BYTE_SWAP32 __builtin_bswap32 +#define ARROW_POPCOUNT64 __builtin_popcountll +#define ARROW_POPCOUNT32 __builtin_popcount Review comment: Yep, so we'll be able to use it by 2030 hopefully? :) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on issue #7014: ARROW-8563: GO Minor change to make newBuilder public
github-actions[bot] commented on issue #7014: URL: https://github.com/apache/arrow/pull/7014#issuecomment-618187424 https://issues.apache.org/jira/browse/ARROW-8563 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] mrkn commented on issue #6667: ARROW-8162: [Format][Python] Add serialization for CSF sparse tensors to Python
mrkn commented on issue #6667: URL: https://github.com/apache/arrow/pull/6667#issuecomment-618185887 @rok Thank you for working on this! I'll merge this. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] houqp commented on issue #7009: ARROW-8552: [Rust] support iterate parquet row columns
houqp commented on issue #7009: URL: https://github.com/apache/arrow/pull/7009#issuecomment-618158794 @nevi-me rebased and tests are passing now :) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] xuancong84 opened a new issue #7017: suggestion: why not serialize complex numbers in a Python list/dict/set
xuancong84 opened a new issue #7017: URL: https://github.com/apache/arrow/issues/7017 Dear developers, I realize that complex numbers in Numpy arrays and Pandas dataframe/series can be serialized, but complex numbers in Python structures (e.g., `[1, 2.5, 3+1.j, np.nan]`) cannot be serialized. Why not make it consistent? Or are there some other reasons for concern? Thanks! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on a change in pull request #6985: ARROW-8413: [C++][Parquet][WIP] Refactor Generating validity bitmap for values column
fsaintjacques commented on a change in pull request #6985: URL: https://github.com/apache/arrow/pull/6985#discussion_r413428461 ## File path: cpp/src/arrow/util/bit_util.h ## @@ -43,13 +43,18 @@ #if defined(_MSC_VER) #include +#include #pragma intrinsic(_BitScanReverse) #pragma intrinsic(_BitScanForward) #define ARROW_BYTE_SWAP64 _byteswap_uint64 #define ARROW_BYTE_SWAP32 _byteswap_ulong +#define ARROW_POPCOUNT64 __popcnt64 +#define ARROW_POPCOUNT32 __popcnt #else #define ARROW_BYTE_SWAP64 __builtin_bswap64 #define ARROW_BYTE_SWAP32 __builtin_bswap32 +#define ARROW_POPCOUNT64 __builtin_popcountll +#define ARROW_POPCOUNT32 __builtin_popcount Review comment: TIL, C++20 has std::popcount. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on issue #7011: ARROW-8554: [C++][Benchmark] Fix building error "cannot bind lvalue"
wesm commented on issue #7011: URL: https://github.com/apache/arrow/pull/7011#issuecomment-618111873 manylinux1 uses gcc 4.8 but does not build the benchmarks. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on issue #7011: ARROW-8554 [C++][Benchmark] Fix building error "cannot bind lvalue"
fsaintjacques commented on issue #7011: URL: https://github.com/apache/arrow/pull/7011#issuecomment-61869 I understood that @bkietz added an entry for this, maybe it doesn't have the benchmark enabled? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on issue #7000: ARROW-8065: [C++][Dataset] Refactor ScanOptions and Fragment relation
fsaintjacques commented on issue #7000: URL: https://github.com/apache/arrow/pull/7000#issuecomment-618110604 Addressed most comments and updated followup ticket with what's missing. PTAL and merge quickly so we can unblock the blocked tickets :) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on issue #5947: ARROW-7300: [C++][Gandiva] Implement functions to cast from strings to integers/floats
wesm commented on issue #5947: URL: https://github.com/apache/arrow/pull/5947#issuecomment-618107435 @praveenbingo @projjal would you be able to take a look now? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on issue #7011: ARROW-8554 [C++][Benchmark] Fix building error "cannot bind lvalue"
wesm commented on issue #7011: URL: https://github.com/apache/arrow/pull/7011#issuecomment-618102656 Another gcc 4.8 issue here. We may need a more comprehensive build that also builds the benchmark executables This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on issue #6972: ARROW-8287: [Rust] Add "pretty" util to help with printing tabular output of RecordBatches
andygrove commented on issue #6972: URL: https://github.com/apache/arrow/pull/6972#issuecomment-618100341 Thanks @markhildreth for the detailed write-up in the JIRA! I've started looking through this. I'm not sure I understand all the points you made yet, but if there is a way to just expose the print_batches method and keep the pretty table details internal to utils then we should do that. I also think it is fine for the moment to ignore the use case where the schema varies between record batches and file a separate issue for that. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on issue #4140: ARROW-5123: [Rust] Parquet derive for simple structs
andygrove commented on issue #4140: URL: https://github.com/apache/arrow/pull/4140#issuecomment-618098211 @bryantbiggs I will take a look at the release tag issue this weekend. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on a change in pull request #7004: ARROW-3827: [Rust] Implement UnionArray Updated
andygrove commented on a change in pull request #7004: URL: https://github.com/apache/arrow/pull/7004#discussion_r413408444 ## File path: rust/arrow/src/array/mod.rs ## @@ -85,6 +85,7 @@ mod array; mod builder; mod data; mod equal; +mod union; Review comment: I agree that it makes sense to have submodules under array. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on a change in pull request #7004: ARROW-3827: [Rust] Implement UnionArray Updated
andygrove commented on a change in pull request #7004: URL: https://github.com/apache/arrow/pull/7004#discussion_r413408241 ## File path: rust/arrow/src/array/equal.rs ## @@ -1046,6 +1062,30 @@ impl PartialEq for Value { } } +impl JsonEqual for UnionArray { +fn equals_json(&self, _json: &[&Value]) -> bool { +unimplemented!() +} +} + +impl PartialEq for UnionArray { +fn eq(&self, json: &Value) -> bool { +match json { +Value::Array(json_array) => self.equals_json_values(&json_array), +_ => false, +} +} +} + +impl PartialEq for Value { +fn eq(&self, arrow: &UnionArray) -> bool { +match self { Review comment: nit: these two `eq` methods are very similar, but in one, you match on `self` and then compare to the argument and in the other one, this is transposed. It would be nice to make these consistent. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on a change in pull request #7004: ARROW-3827: [Rust] Implement UnionArray Updated
andygrove commented on a change in pull request #7004: URL: https://github.com/apache/arrow/pull/7004#discussion_r413407658 ## File path: rust/arrow/src/array/equal.rs ## @@ -692,6 +692,22 @@ impl ArrayEqual for StructArray { } } +impl ArrayEqual for UnionArray { +fn equals(&self, _other: &dyn Array) -> bool { +unimplemented!() +} + +fn range_equals( +&self, +_other: &dyn Array, +_start_idx: usize, +_end_idx: usize, +_other_start_idx: usize, +) -> bool { +unimplemented!() Review comment: Is this intentionally unimplemented? If so, would be good to add a comment explaining that. ## File path: rust/arrow/src/array/equal.rs ## @@ -1046,6 +1062,30 @@ impl PartialEq for Value { } } +impl JsonEqual for UnionArray { +fn equals_json(&self, _json: &[&Value]) -> bool { +unimplemented!() Review comment: Is this intentionally unimplemented? If so, would be good to add a comment explaining that. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on a change in pull request #7004: ARROW-3827: [Rust] Implement UnionArray Updated
andygrove commented on a change in pull request #7004: URL: https://github.com/apache/arrow/pull/7004#discussion_r413407264 ## File path: rust/arrow/src/array/equal.rs ## @@ -692,6 +692,22 @@ impl ArrayEqual for StructArray { } } +impl ArrayEqual for UnionArray { +fn equals(&self, _other: &dyn Array) -> bool { +unimplemented!() Review comment: Is this intentionally unimplemented? If so, would be good to add a comment explaining that. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on a change in pull request #6770: ARROW-7842: [Rust] [Parquet] implement array_reader for list type columns
andygrove commented on a change in pull request #6770: URL: https://github.com/apache/arrow/pull/6770#discussion_r413404556 ## File path: rust/datafusion/src/utils.rs ## @@ -120,6 +143,7 @@ pub fn array_value_to_string(column: array::ArrayRef, row: usize) -> Result { make_string!(array::Time64NanosecondArray, column, row) } +DataType::List(_) => make_string_from_list!(column, row), Review comment: Does it matter what data type the list is? Can lists of any type be converted into strings? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on a change in pull request #6770: ARROW-7842: [Rust] [Parquet] implement array_reader for list type columns
andygrove commented on a change in pull request #6770: URL: https://github.com/apache/arrow/pull/6770#discussion_r413404215 ## File path: rust/datafusion/src/utils.rs ## @@ -74,6 +74,29 @@ macro_rules! make_string { }}; } +macro_rules! make_string_from_list { +($column: ident, $row: ident) => {{ +let list = $column +.as_any() +.downcast_ref::() +.unwrap() +.value($row); +let string_values = match (0..list.len()) +.map(|i| array_value_to_string(list.clone(), i)) +.collect::>>() +{ +Ok(values) => values, +_ => { +return Err(ExecutionError::ExecutionError(format!( +"Unsupported {:?} type for repl.", Review comment: What does `repl` mean here? Maybe this error could be a bit more descriptive? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on a change in pull request #6770: ARROW-7842: [Rust] [Parquet] implement array_reader for list type columns
andygrove commented on a change in pull request #6770: URL: https://github.com/apache/arrow/pull/6770#discussion_r413403760 ## File path: rust/datafusion/src/utils.rs ## @@ -74,6 +74,29 @@ macro_rules! make_string { }}; } +macro_rules! make_string_from_list { +($column: ident, $row: ident) => {{ +let list = $column +.as_any() +.downcast_ref::() +.unwrap() Review comment: `expect` would be better than `unwrap`. Using a `Result` would be better than `expect`. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on a change in pull request #6770: ARROW-7842: [Rust] [Parquet] implement array_reader for list type columns
andygrove commented on a change in pull request #6770: URL: https://github.com/apache/arrow/pull/6770#discussion_r413403453 ## File path: rust/datafusion/src/logicalplan.rs ## @@ -828,8 +828,8 @@ mod tests { .build()?; let expected = "Projection: #id\ -\n Selection: #state Eq Utf8(\"CO\")\ -\nTableScan: employee.csv projection=Some([0, 3])"; +\n Selection: #state Eq Utf8(\"CO\")\ +\nTableScan: employee.csv projection=Some([0, 3])"; Review comment: Are these formatting changes from running rustfmt? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] mcassels commented on issue #6770: ARROW-7842 [Parquet][Rust] implement array_reader for list type columns
mcassels commented on issue #6770: URL: https://github.com/apache/arrow/pull/6770#issuecomment-618082896 @andygrove @nevi-me do you have any thoughts on this? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kou commented on issue #6996: ARROW-8538: [Packaging] Remove boost from homebrew formula
kou commented on issue #6996: URL: https://github.com/apache/arrow/pull/6996#issuecomment-618066972 Could you make a JIRA? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nealrichardson commented on issue #6996: ARROW-8538: [Packaging] Remove boost from homebrew formula
nealrichardson commented on issue #6996: URL: https://github.com/apache/arrow/pull/6996#issuecomment-618053499 @kou are you adding `osx_image: xcode11.3` somewhere already or should I make a JIRA? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on issue #7016: ARROW-8561: [C++][Gandiva] Stop using deprecated google::protobuf::MessageLite::ByteSize()
github-actions[bot] commented on issue #7016: URL: https://github.com/apache/arrow/pull/7016#issuecomment-618051647 https://issues.apache.org/jira/browse/ARROW-8561 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on issue #7016: ARROW-8561: [C++][Gandiva] Stop using deprecated google::protobuf::MessageLite::ByteSize()
github-actions[bot] commented on issue #7016: URL: https://github.com/apache/arrow/pull/7016#issuecomment-618049644 Revision: e6ed98341efbd0f7bfed30a7aaf12935afb85fa5 Submitted crossbow builds: [ursa-labs/crossbow @ actions-164](https://github.com/ursa-labs/crossbow/branches/all?query=actions-164) |Task|Status| ||--| |gandiva-jar-osx|[![TravisCI](https://img.shields.io/travis/ursa-labs/crossbow/actions-164-travis-gandiva-jar-osx.svg)](https://travis-ci.org/ursa-labs/crossbow/branches)| |gandiva-jar-xenial|[![TravisCI](https://img.shields.io/travis/ursa-labs/crossbow/actions-164-travis-gandiva-jar-xenial.svg)](https://travis-ci.org/ursa-labs/crossbow/branches)| This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kou commented on issue #7016: ARROW-8561: [C++][Gandiva] Stop using deprecated google::protobuf::MessageLite::ByteSize()
kou commented on issue #7016: URL: https://github.com/apache/arrow/pull/7016#issuecomment-618049071 @github-actions crossbow submit -g gandiva This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kou opened a new pull request #7016: ARROW-8561: [C++][Gandiva] Stop using deprecated google::protobuf::MessageLite::ByteSize()
kou opened a new pull request #7016: URL: https://github.com/apache/arrow/pull/7016 ByteSize() is deprecated and ByteSizeLong() is added since Protobuf 3.4.0. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on issue #7001: Use lowercase ws2_32 everywhere
wesm commented on issue #7001: URL: https://github.com/apache/arrow/pull/7001#issuecomment-618021001 @davidanthoff would you mind opening a JIRA issue for this and updating the PR title? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on issue #7015: ARROW-8560: [Rust] Docs for MutableBuffer resize are incorrect
github-actions[bot] commented on issue #7015: URL: https://github.com/apache/arrow/pull/7015#issuecomment-618016015 https://issues.apache.org/jira/browse/ARROW-8560 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] paddyhoran opened a new pull request #7015: ARROW-8560: [Rust] Docs for MutableBuffer resize are incorrect
paddyhoran opened a new pull request #7015: URL: https://github.com/apache/arrow/pull/7015 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on issue #6981: PARQUET-1845: [C++] Add expected results of Int96 in big-endian
wesm commented on issue #6981: URL: https://github.com/apache/arrow/pull/6981#issuecomment-617947022 I haven't had a chance to look in detail. Perhaps someone on the Parquet mailing list might be able to help This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kiszk commented on issue #6981: PARQUET-1845: [C++] Add expected results of Int96 in big-endian
kiszk commented on issue #6981: URL: https://github.com/apache/arrow/pull/6981#issuecomment-617935417 @wesm do you have any comment on this change? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] sunchao commented on a change in pull request #6949: ARROW-7681: [Rust] Explicitly seeking a BufReader will discard the internal buffer (2)
sunchao commented on a change in pull request #6949: URL: https://github.com/apache/arrow/pull/6949#discussion_r413158295 ## File path: rust/parquet/src/util/io.rs ## @@ -31,47 +33,83 @@ pub trait Position { } /// Struct that represents a slice of a file data with independent start position and -/// length. Internally clones provided file handle, wraps with BufReader and resets -/// position before any read. +/// length. Internally clones provided file handle, wraps with a custom implementation +/// of BufReader that resets position before any read. /// /// This is workaround and alternative for `file.try_clone()` method. It clones `File` /// while preserving independent position, which is not available with `try_clone()`. /// -/// Designed after `arrow::io::RandomAccessFile`. +/// Designed after `arrow::io::RandomAccessFile` and `std::io::BufReader` pub struct FileSource { -reader: Mutex>, -start: u64, // start position in a file -end: u64, // end position in a file +reader: RefCell, +start: u64, // start position in a file +end: u64, // end position in a file +buf: Vec, // the internal buffer `buf` in BufReader +buf_pos: usize, // equivalent to the `pos` param in BufReader Review comment: This is not super useful as it requires ppl to jump into `BufReader` to check the documentation - maybe add additional comments on what these two fields are for? ## File path: rust/parquet/src/util/io.rs ## @@ -31,47 +33,83 @@ pub trait Position { } /// Struct that represents a slice of a file data with independent start position and -/// length. Internally clones provided file handle, wraps with BufReader and resets -/// position before any read. +/// length. Internally clones provided file handle, wraps with a custom implementation +/// of BufReader that resets position before any read. /// /// This is workaround and alternative for `file.try_clone()` method. It clones `File` /// while preserving independent position, which is not available with `try_clone()`. 
/// -/// Designed after `arrow::io::RandomAccessFile`. +/// Designed after `arrow::io::RandomAccessFile` and `std::io::BufReader` pub struct FileSource { -reader: Mutex>, -start: u64, // start position in a file -end: u64, // end position in a file +reader: RefCell, +start: u64, // start position in a file +end: u64, // end position in a file +buf: Vec, // the internal buffer `buf` in BufReader +buf_pos: usize, // equivalent to the `pos` param in BufReader +buf_cap: usize, // equivalent to the `cap` param in BufReader } impl FileSource { /// Creates new file reader with start and length from a file handle pub fn new(fd: &R, start: u64, length: usize) -> Self { +let reader = RefCell::new(fd.try_clone().unwrap()); Self { -reader: Mutex::new(BufReader::new(fd.try_clone().unwrap())), +reader, start, end: start + length as u64, +buf: vec![0 as u8; DEFAULT_BUF_SIZE], +buf_pos: 0, +buf_cap: 0, } } + +// inspired from BufReader +fn fill_buf(&mut self) -> Result<&[u8]> { +if self.buf_pos >= self.buf_cap { +// If we've reached the end of our internal buffer then we need to fetch +// some more data from the underlying reader. +// Branch using `>=` instead of the more correct `==` +// to tell the compiler that the pos..cap slice is always valid. 
+debug_assert!(self.buf_pos == self.buf_cap); +let mut reader = self.reader.borrow_mut(); +reader.seek(SeekFrom::Start(self.start))?; // always seek to start before reading +self.buf_cap = reader.read(&mut self.buf)?; +self.buf_pos = 0; +} +Ok(&self.buf[self.buf_pos..self.buf_cap]) +} } impl Read for FileSource { fn read(&mut self, buf: &mut [u8]) -> Result { -let mut reader = self -.reader -.lock() -.map_err(|err| Error::new(ErrorKind::Other, err.to_string()))?; - let bytes_to_read = cmp::min(buf.len(), (self.end - self.start) as usize); let buf = &mut buf[0..bytes_to_read]; -reader.seek(SeekFrom::Start(self.start as u64))?; -let res = reader.read(buf); -if let Ok(bytes_read) = res { -self.start += bytes_read as u64; +// If we don't have any buffered data and we're doing a massive read +// (larger than our internal buffer), bypass our internal buffer +// entirely. +if self.buf_pos == self.buf_cap && buf.len() >= self.buf.len() { +// discard buffer +self.buf_pos = 0; +self.buf_cap = 0; +// read directly into param buffer +let mut reader = self.reader.borrow_mut(); Review comment: nit: m
[GitHub] [arrow] github-actions[bot] commented on issue #6995: ARROW-8549: [R] Assorted post-0.17 release cleanups
github-actions[bot] commented on issue #6995: URL: https://github.com/apache/arrow/pull/6995#issuecomment-617926769 Revision: 61e54eb79986e3946e15c373c76e3eee4294dd44 Submitted crossbow builds: [ursa-labs/crossbow @ actions-163](https://github.com/ursa-labs/crossbow/branches/all?query=actions-163) |Task|Status| ||--| |test-conda-r-3.6|[![CircleCI](https://img.shields.io/circleci/build/github/ursa-labs/crossbow/actions-163-circle-test-conda-r-3.6.svg)](https://circleci.com/gh/ursa-labs/crossbow/tree/actions-163-circle-test-conda-r-3.6)| |test-ubuntu-18.04-r-3.6|[![CircleCI](https://img.shields.io/circleci/build/github/ursa-labs/crossbow/actions-163-circle-test-ubuntu-18.04-r-3.6.svg)](https://circleci.com/gh/ursa-labs/crossbow/tree/actions-163-circle-test-ubuntu-18.04-r-3.6)| This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nealrichardson commented on issue #6995: ARROW-8549: [R] Assorted post-0.17 release cleanups
nealrichardson commented on issue #6995: URL: https://github.com/apache/arrow/pull/6995#issuecomment-617926040 @github-actions crossbow submit test-conda-r-3.6 test-ubuntu-18.04-r-3.6 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] houqp commented on issue #7009: ARROW-8552: [Rust] support iterate parquet row columns
houqp commented on issue #7009: URL: https://github.com/apache/arrow/pull/7009#issuecomment-617914513 Thanks @nevi-me for the fix, looks like your PR has been approved. I will wait for the merge. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on a change in pull request #7000: ARROW-8065: [C++][Dataset] Refactor ScanOptions and Fragment relation
fsaintjacques commented on a change in pull request #7000: URL: https://github.com/apache/arrow/pull/7000#discussion_r413164872 ## File path: python/pyarrow/_dataset.pyx ## @@ -519,30 +500,69 @@ cdef class Fragment: """ return Expression.wrap(self.fragment.partition_expression()) -def to_table(self, use_threads=True, MemoryPool memory_pool=None): -"""Convert this Fragment into a Table. +def _scanner(self, **kwargs): +return Scanner.from_fragment(self, **kwargs) -Use this convenience utility with care. This will serially materialize -the Scan result in memory before creating the Table. +def scan(self, columns=None, filter=None, use_threads=True, + MemoryPool memory_pool=None, **kwargs): +"""Builds a scan operation against the dataset. + Review comment: Yes, but you can still pass schema (which is not documented. I'll add this. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on issue #7014: Go: Minor change to make newBuilder public to aid upstream
github-actions[bot] commented on issue #7014: URL: https://github.com/apache/arrow/pull/7014#issuecomment-617881461 Thanks for opening a pull request! Could you open an issue for this pull request on JIRA? https://issues.apache.org/jira/browse/ARROW Then could you also rename pull request title in the following format? ARROW-${JIRA_ID}: [${COMPONENT}] ${SUMMARY} See also: * [Other pull requests](https://github.com/apache/arrow/pulls/) * [Contribution Guidelines - How to contribute patches](https://arrow.apache.org/docs/developers/contributing.html#how-to-contribute-patches) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] mindhash opened a new pull request #7014: Go: Minor change to make newBuilder public to aid upstream
mindhash opened a new pull request #7014: URL: https://github.com/apache/arrow/pull/7014 Hello team, This minor change makes newBuilder() public to reduce verbosity in upstream. To give you example, I am working on a parquet read / write into Arrow Record batch where the parquet data types are mapped to Arrow data types. My Repo: https://github.com/mindhash/arrow-parquet-go In such cases, it will be nice to have a builder API (newBuilder) be generic to accept a data type and return a respective array. I am looking at a similar situation for JSON reader. I think this change will make the builder API much easier for upstream as well as internal packages. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on issue #7000: ARROW-8065: [C++][Dataset] Refactor ScanOptions and Fragment relation
kszucs commented on issue #7000: URL: https://github.com/apache/arrow/pull/7000#issuecomment-617868408 @ursabot build This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #6954: ARROW-8440: [C++] Refine SIMD header files
pitrou commented on a change in pull request #6954: URL: https://github.com/apache/arrow/pull/6954#discussion_r413092039 ## File path: cpp/src/arrow/util/hash_util.h ## @@ -27,39 +27,27 @@ #include "arrow/util/logging.h" #include "arrow/util/macros.h" -#include "arrow/util/neon_util.h" -#include "arrow/util/sse_util.h" +#include "arrow/util/simd.h" -static inline uint32_t HW_crc32_u8(uint32_t crc, uint8_t v) { - DCHECK(false) << "Hardware CRC support is not enabled"; - return 0; -} - -static inline uint32_t HW_crc32_u16(uint32_t crc, uint16_t v) { - DCHECK(false) << "Hardware CRC support is not enabled"; - return 0; -} - -static inline uint32_t HW_crc32_u32(uint32_t crc, uint32_t v) { - DCHECK(false) << "Hardware CRC support is not enabled"; - return 0; -} - -static inline uint32_t HW_crc32_u64(uint32_t crc, uint64_t v) { +#ifdef ARROW_HAVE_SSE4_2 Review comment: Also, regardless of whether we move them or not, you should put them in the `arrow::internal` namespace. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kiszk commented on issue #6954: ARROW-8440: [C++] Refine SIMD header files
kiszk commented on issue #6954: URL: https://github.com/apache/arrow/pull/6954#issuecomment-617854855 Is the function `Armv8CrcHashParallel` uses somewhere? Sorry if I overlook it. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kiszk edited a comment on issue #6954: ARROW-8440: [C++] Refine SIMD header files
kiszk edited a comment on issue #6954: URL: https://github.com/apache/arrow/pull/6954#issuecomment-617854855 Is the function `Armv8CrcHashParallel` used somewhere? Sorry if I overlook it. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on a change in pull request #7000: ARROW-8065: [C++][Dataset] Refactor ScanOptions and Fragment relation
fsaintjacques commented on a change in pull request #7000: URL: https://github.com/apache/arrow/pull/7000#discussion_r413074200 ## File path: cpp/src/arrow/dataset/dataset.cc ## @@ -30,34 +30,40 @@ namespace arrow { namespace dataset { -Fragment::Fragment(std::shared_ptr scan_options) -: scan_options_(std::move(scan_options)), partition_expression_(scalar(true)) {} +Fragment::Fragment(std::shared_ptr partition_expression) +: partition_expression_(partition_expression ? partition_expression : scalar(true)) {} -const std::shared_ptr& Fragment::schema() const { - return scan_options_->schema(); -} +Result> InMemoryFragment::ReadPhysicalSchema() { Review comment: The effort to open a JIRA is equal to implementing it. I added it. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on issue #6982: ARROW-8514: [Developer][Release] Verify Python 3.5 Windows wheel
wesm commented on issue #6982: URL: https://github.com/apache/arrow/pull/6982#issuecomment-617827466 +1 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on issue #7003: from pyarrow import parquet fails with AttributeError: type object 'pyarrow._parquet.Statistics' has no attribute '__reduce_cython__'
wesm commented on issue #7003: URL: https://github.com/apache/arrow/issues/7003#issuecomment-617827061 Can you please open a JIRA issue and provide instructions to reproduce? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on issue #7012: ARROW-8555: [FlightRPC][Java] implement DoExchange
github-actions[bot] commented on issue #7012: URL: https://github.com/apache/arrow/pull/7012#issuecomment-617824864 https://issues.apache.org/jira/browse/ARROW-8555 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on issue #7013: ARROW-8512: [C++] Remove unused expression/operator prototype code
github-actions[bot] commented on issue #7013: URL: https://github.com/apache/arrow/pull/7013#issuecomment-617824867 https://issues.apache.org/jira/browse/ARROW-8512 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on a change in pull request #7000: ARROW-8065: [C++][Dataset] Refactor ScanOptions and Fragment relation
fsaintjacques commented on a change in pull request #7000: URL: https://github.com/apache/arrow/pull/7000#discussion_r413045582 ## File path: cpp/src/arrow/dataset/file_base.cc ## @@ -222,9 +214,8 @@ FragmentIterator FileSystemDataset::GetFragmentsImpl( } Result> FileSystemDataset::Write( -const WritePlan& plan, std::shared_ptr scan_context) { - std::vector> options(plan.paths.size()); - +const WritePlan& plan, std::shared_ptr scan_options, Review comment: That's a task for [ARROW-8382](https://jira.apache.org/jira/projects/ARROW/issues/ARROW-8382). The writer shouldn't be aware of such issue, it takes a file source, a format, a RecordBatchReader (the dropping should happen internally here) and writes it. Since the Writer API has no bindings and not in use yet, I added this to ARROW-8382 so we can prioritise ARROW-8062 (parquet dataset) and ARROW-8318 (fragment stored in dataset). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on a change in pull request #7000: ARROW-8065: [C++][Dataset] Refactor ScanOptions and Fragment relation
fsaintjacques commented on a change in pull request #7000: URL: https://github.com/apache/arrow/pull/7000#discussion_r413046083 ## File path: cpp/src/arrow/dataset/dataset.h ## @@ -100,16 +97,20 @@ class ARROW_DS_EXPORT InMemoryFragment : public Fragment { RecordBatchVector record_batches_; }; -/// \brief A container of zero or more Fragments. A Dataset acts as a discovery mechanism -/// of Fragments and partitions, e.g. files deeply nested in a directory. +/// \brief A container of zero or more Fragments. +/// +/// A Dataset acts as a union of Fragments, e.g. files deeply nested in a +/// directory. A Dataset has a schema, also known as the "reader" schema. +/// class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this { public: /// \brief Begin to build a new Scan operation against this Dataset Result> NewScan(std::shared_ptr context); Result> NewScan(); - /// \brief GetFragments returns an iterator of Fragments given ScanOptions. - FragmentIterator GetFragments(std::shared_ptr options); + /// \brief GetFragments returns an iterator of Fragments given a predicate. + FragmentIterator GetFragments(std::shared_ptr predicate); + FragmentIterator GetFragments(); Review comment: Ditto. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on a change in pull request #7000: ARROW-8065: [C++][Dataset] Refactor ScanOptions and Fragment relation
fsaintjacques commented on a change in pull request #7000: URL: https://github.com/apache/arrow/pull/7000#discussion_r413045582 ## File path: cpp/src/arrow/dataset/file_base.cc ## @@ -222,9 +214,8 @@ FragmentIterator FileSystemDataset::GetFragmentsImpl( } Result> FileSystemDataset::Write( -const WritePlan& plan, std::shared_ptr scan_context) { - std::vector> options(plan.paths.size()); - +const WritePlan& plan, std::shared_ptr scan_options, Review comment: That's a task for [ARROW-8382](https://jira.apache.org/jira/projects/ARROW/issues/ARROW-8382). The writer shouldn't be aware of such issue, it takes a file source, a format, a RecordBatchReader (the dropping should happen internally here) and writes it. Since the Writer API has no bindings and not in use, I added this to ARROW-8382 so we can prioritise ARROW-8062 (parquet dataset) and ARROW-8318 (fragment stored in dataset). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm opened a new pull request #7013: ARROW-8512: [C++] Remove unused expression/operator prototype code
wesm opened a new pull request #7013: URL: https://github.com/apache/arrow/pull/7013 None of this code was ever used. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] lidavidm opened a new pull request #7012: ARROW-8555: [FlightRPC][Java] implement DoExchange
lidavidm opened a new pull request #7012: URL: https://github.com/apache/arrow/pull/7012 This is a complete implementation of DoExchange for Java. It is not tested against the C++ implementation yet, however, it still passes integration tests, so the internal refactoring should not have broken compatibility with existing clients/servers. In this PR, I've refactored DoGet/DoPut/DoExchange on the client and server to share their implementation as much as possible. DoGet/DoPut retain their behavior of "eagerly" reading/writing schemas, but DoExchange allows the client/server to delay writing the schema until ready. This is checked in the unit tests. I also ran into some test flakes and tried to address them, by making sure we clean up things in the right order, and adding missing `close()` calls in some existing tests. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kiszk commented on a change in pull request #6954: ARROW-8440: [C++] Refine SIMD header files
kiszk commented on a change in pull request #6954: URL: https://github.com/apache/arrow/pull/6954#discussion_r413030123 ## File path: cpp/src/arrow/util/hash_util.h ## @@ -27,39 +27,27 @@ #include "arrow/util/logging.h" #include "arrow/util/macros.h" -#include "arrow/util/neon_util.h" -#include "arrow/util/sse_util.h" +#include "arrow/util/simd.h" -static inline uint32_t HW_crc32_u8(uint32_t crc, uint8_t v) { - DCHECK(false) << "Hardware CRC support is not enabled"; - return 0; -} - -static inline uint32_t HW_crc32_u16(uint32_t crc, uint16_t v) { - DCHECK(false) << "Hardware CRC support is not enabled"; - return 0; -} - -static inline uint32_t HW_crc32_u32(uint32_t crc, uint32_t v) { - DCHECK(false) << "Hardware CRC support is not enabled"; - return 0; -} - -static inline uint32_t HW_crc32_u64(uint32_t crc, uint64_t v) { +#ifdef ARROW_HAVE_SSE4_2 Review comment: How about moving this part into `simd.h`? We can reduce the number of places that depends on the target hardware. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on issue #7011: ARROW-8554 [C++][Benchmark] Fix building error "cannot bind lvalue"
github-actions[bot] commented on issue #7011: URL: https://github.com/apache/arrow/pull/7011#issuecomment-617806628 https://issues.apache.org/jira/browse/ARROW-8554 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] lidavidm commented on issue #6656: ARROW-8297: [FlightRPC][C++] Implement Flight DoExchange for C++
lidavidm commented on issue #6656: URL: https://github.com/apache/arrow/pull/6656#issuecomment-617806080 This should be ready now. The only issue I have is I could not figure out how to get GCC 4.8 to use the unique_ptr overload of a method instead of the shared_ptr overload. https://github.com/apache/arrow/pull/6656/files#diff-34371f71b4b1eecc5f855dd3dbd3340cR207-R214 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] plusplusjiajia opened a new pull request #7011: ARROW-8554 [C++][Benchmark] Fix building error "cannot bind lvalue"
plusplusjiajia opened a new pull request #7011: URL: https://github.com/apache/arrow/pull/7011 https://issues.apache.org/jira/browse/ARROW-8554 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on a change in pull request #7000: ARROW-8065: [C++][Dataset] Refactor ScanOptions and Fragment relation
fsaintjacques commented on a change in pull request #7000: URL: https://github.com/apache/arrow/pull/7000#discussion_r413007995 ## File path: cpp/src/arrow/dataset/dataset.h ## @@ -84,13 +82,12 @@ class ARROW_DS_EXPORT Fragment { class ARROW_DS_EXPORT InMemoryFragment : public Fragment { public: InMemoryFragment(RecordBatchVector record_batches, - std::shared_ptr scan_options); + std::shared_ptr = NULLPTR); Review comment: Doesn't compile. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kiszk commented on a change in pull request #6985: ARROW-8413: [C++][Parquet][WIP] Refactor Generating validity bitmap for values column
kiszk commented on a change in pull request #6985: URL: https://github.com/apache/arrow/pull/6985#discussion_r412990504 ## File path: cpp/src/parquet/column_reader.cc ## @@ -50,6 +51,141 @@ using arrow::internal::checked_cast; namespace parquet { +namespace { + +inline void CheckLevelRange(const int16_t* levels, int64_t num_levels, +const int16_t max_expected_level) { + int16_t min_level = std::numeric_limits::max(); + int16_t max_level = std::numeric_limits::min(); + for (int x = 0; x < num_levels; x++) { +min_level = std::min(levels[x], min_level); +max_level = std::max(levels[x], max_level); + } + if (ARROW_PREDICT_FALSE(num_levels > 0 && (min_level < 0 || max_level > max_level))) { +throw ParquetException("definition level exceeds maximum"); + } +} + +#if !defined(ARROW_HAVE_BMI2) + +inline void DefinitionLevelsToBitmapScalar( +const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, +const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, +uint8_t* valid_bits, int64_t valid_bits_offset) { + // We assume here that valid_bits is large enough to accommodate the + // additional definition levels and the ones that have already been written + ::arrow::internal::BitmapWriter valid_bits_writer(valid_bits, valid_bits_offset, +num_def_levels); + + // TODO(itaiin): As an interim solution we are splitting the code path here + // between repeated+flat column reads, and non-repeated+nested reads. 
+ // Those paths need to be merged in the future + for (int i = 0; i < num_def_levels; ++i) { +if (def_levels[i] == max_definition_level) { + valid_bits_writer.Set(); +} else if (max_repetition_level > 0) { + // repetition+flat case + if (def_levels[i] == (max_definition_level - 1)) { +valid_bits_writer.Clear(); +*null_count += 1; + } else { +continue; + } +} else { + // non-repeated+nested case + if (def_levels[i] < max_definition_level) { +valid_bits_writer.Clear(); +*null_count += 1; + } else { +throw ParquetException("definition level exceeds maximum"); + } +} + +valid_bits_writer.Next(); + } + valid_bits_writer.Finish(); + *values_read = valid_bits_writer.position(); +} +#endif + +template +void DefinitionLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_levels, + const int16_t required_definition_level, + int64_t* values_read, int64_t* null_count, + uint8_t* valid_bits, int64_t valid_bits_offset) { + constexpr int64_t kBitMaskSize = 64; + int64_t set_count = 0; + *values_read = 0; + while (num_def_levels > 0) { +int64_t batch_size = std::min(num_def_levels, kBitMaskSize); +CheckLevelRange(def_levels, batch_size, required_definition_level); +uint64_t defined_bitmap = internal::GreaterThanBitmap(def_levels, batch_size, + required_definition_level - 1); +if (has_repeated_parent) { + // This is currently a specialized code path assuming only (nested) lists + // present through the leaf (i.e. no structs). + // Upper level code only calls this method + // when the leaf-values are nullable (otherwise no spacing is needed), + // Because only nested lists exists it is sufficient to know that the field + // was either null or included it (i.e. >= previous definition level -> > previous + // definition - 1). If there where structs mixed in, we need to know the def_level + // of the repeated parent so we can check for def_level > "def level of repeated + // parent". 
+ uint64_t present_bitmap = internal::GreaterThanBitmap( + def_levels, batch_size, required_definition_level - 2); + *values_read += internal::AppendSelectedBitsToValidityBitmap( + /*new_bits=*/defined_bitmap, + /*selection_bitmap*/ present_bitmap, valid_bits, &valid_bits_offset, + &set_count); +} else { + internal::AppendToValidityBitmap( + /*new_bits=*/defined_bitmap, + /*new_bit_count=*/batch_size, valid_bits, &valid_bits_offset, &set_count); + *values_read += batch_size; +} +def_levels += batch_size; +num_def_levels -= batch_size; + } + *null_count += *values_read - set_count; +} + +inline void DefinitionLevelsToBitmapDispatch( +const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, +const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, +uint8_t* valid_bits, int64_t valid_bits_offset) { + if (max_repetition_level > 0) { +#if ARROW_LITTLE_ENDIAN +// we use AVx2 as a proxy for BMI2 instruction set, since there doesn't seem to be a clean +// way o detect t
[GitHub] [arrow] lidavidm commented on issue #6744: PARQUET-1820: [C++] pre-buffer specified columns of row group
lidavidm commented on issue #6744: URL: https://github.com/apache/arrow/pull/6744#issuecomment-617784720 Thanks for the review! > It might be nice to have a convenience for prebuffering an entire row group. Something like > auto rg = file_reader->RowGroup(i); > rg->PreBufferAllColumns(); I think it's not worth it given this is a relatively low-level API, but may be handy if you do expect people to be using this directly often (instead of via ArrowReaderProperties). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kiszk commented on a change in pull request #6985: ARROW-8413: [C++][Parquet][WIP] Refactor Generating validity bitmap for values column
kiszk commented on a change in pull request #6985: URL: https://github.com/apache/arrow/pull/6985#discussion_r412987615 ## File path: cpp/src/parquet/CMakeLists.txt ## @@ -336,6 +336,7 @@ set_source_files_properties(public_api_test.cc add_parquet_test(reader_test SOURCES column_reader_test.cc +level_conversion_test.cc Review comment: nit: indentation This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] lidavidm commented on a change in pull request #6744: PARQUET-1820: [C++] pre-buffer specified columns of row group
lidavidm commented on a change in pull request #6744: URL: https://github.com/apache/arrow/pull/6744#discussion_r412987199 ## File path: cpp/src/parquet/properties.h ## @@ -56,10 +60,32 @@ class PARQUET_EXPORT ReaderProperties { bool is_buffered_stream_enabled() const { return buffered_stream_enabled_; } + bool is_coalesced_stream_enabled() const { return coalesced_stream_enabled_; } + void enable_buffered_stream() { buffered_stream_enabled_ = true; } void disable_buffered_stream() { buffered_stream_enabled_ = false; } + /// Enable read coalescing. + /// + /// When enabled, readers can optionally call + /// ParquetFileReader.PreBuffer to cache the necessary slices of the + /// file in-memory before deserialization. Arrow readers + /// automatically do this. This is intended to increase performance + /// when reading from high-latency filesystems (e.g. Amazon S3). + /// + /// When this is enabled, it is NOT SAFE to concurrently create + /// RecordBatchReaders from the same file. Review comment: I've updated the wording - it's not unsafe (in that it won't crash), but it'll fail if you try to read a row group/column chunk that wasn't cached. This is because there's one shared cache that is not specific to a particular reader. Looking through the code, we could make this work better by avoiding a PreBuffer method and instead threading the cache all the way from the Arrow readers to the Parquet implementation. I think that's possible, but would require passing the cache between several classes/methods - do you think that's worth it? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] lidavidm commented on a change in pull request #6744: PARQUET-1820: [C++] pre-buffer specified columns of row group
lidavidm commented on a change in pull request #6744: URL: https://github.com/apache/arrow/pull/6744#discussion_r412985177 ## File path: cpp/src/parquet/file_reader.h ## @@ -117,6 +117,15 @@ class PARQUET_EXPORT ParquetFileReader { // Returns the file metadata. Only one instance is ever created std::shared_ptr metadata() const; + /// Pre-buffer the specified column indices in all row groups. + /// + /// Only has an effect if ReaderProperties.is_coalesced_stream_enabled is set; + /// otherwise this is a no-op. The reader internally maintains a cache which is + /// overwritten each time this is called. Intended to increase performance on + /// high-latency filesystems (e.g. Amazon S3). + void PreBuffer(const std::vector& row_groups, + const std::vector& column_indices); Review comment: I changed this to accept the CacheOptions struct here, which gives the caller full control each time they invoke it. ## File path: cpp/src/parquet/file_reader.h ## @@ -117,6 +117,15 @@ class PARQUET_EXPORT ParquetFileReader { // Returns the file metadata. Only one instance is ever created std::shared_ptr metadata() const; + /// Pre-buffer the specified column indices in all row groups. + /// + /// Only has an effect if ReaderProperties.is_coalesced_stream_enabled is set; Review comment: Yes - that makes more sense, combined with moving the flag to ArrowReaderProperties. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] paddyhoran commented on a change in pull request #7004: ARROW-3827: [Rust] Implement UnionArray Updated
paddyhoran commented on a change in pull request #7004: URL: https://github.com/apache/arrow/pull/7004#discussion_r412974468 ## File path: rust/arrow/src/array/union.rs ## @@ -0,0 +1,1172 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains the `UnionArray` and `UnionBuilder` types. +//! +//! Each slot in a `UnionArray` can have a value chosen from a number of types. Each of the +//! possible types are named like the fields of a [`StructArray`](crate::array::StructArray). +//! A `UnionArray` can have two possible memory layouts, "dense" or "sparse". For more information +//! on please see the [specification](https://arrow.apache.org/docs/format/Columnar.html#union-layout). +//! +//! Builders are provided for `UnionArray`'s involving primitive types. `UnionArray`'s of nested +//! types are also supported but not via `UnionBuilder`, see the tests for examples. +//! +//! # Example: Dense Memory Layout +//! +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_dense(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! 
builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 0_i32); +//! assert_eq!(union.value_offset(2), 1_i32); +//! +//! # Ok(()) +//! # } +//! ``` +//! +//! # Example: Sparse Memory Layout +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_sparse(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 1_i32); +//! assert_eq!(union.value_offset(2), 2_i32); +//! +//! # Ok(()) +//! # } +//! ``` +use crate::array::{ +builder::{builder_to_mutable_buffer, mutable_buffer_to_builder, BufferBuilderTrait}, +make_array, Array, ArrayData, ArrayDataBuilder, ArrayDataRef, ArrayRef, +BooleanBufferBuilder, BufferBuilder, Int32BufferBuilder, Int8BufferBuilder, +}; +use crate::buffer::{Buffer, MutableBuffer}; +use crate::datatypes::*; +use crate::error::{ArrowError, Result}; + +use crate::util::bit_util; +use core::fmt; +use std::any::Any; +use std::collections::HashMap; +use std::mem::size_of; + +/// An Array that can represent slots of varying types +pub struct UnionArray { +data: ArrayDataRef, +boxed_fields: Vec, +} + +impl UnionArray { +/// Creates a new `UnionArray`. +/// +/// Accepts type ids, child arrays and optionally offsets (for dense unions) to create +/// a new `UnionArray`. 
This method makes no attempt to validate the data provided by the +/// caller and assumes that each of the components are correct and consistent with each other. +/// See `try_new` for an alternative that validates the data provided. +/// +/// # Data Consistency +/// +/// The `type_ids` `Buffer` should contain `i8` values. These values should be greater than +/// zero and must be less than the number of children provided in `child_arrays`. These values +/// are used to index into the `child_arrays`. +/// +/// The `value_offsets` `Buffer` should contain `i32` values. These values should be greater Review comment: Yea, I only have a single constructor for both types that takes an `Option` for the `value_offsets`. The comment should point this out, I'll update. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to
[GitHub] [arrow] paddyhoran commented on a change in pull request #7004: ARROW-3827: [Rust] Implement UnionArray Updated
paddyhoran commented on a change in pull request #7004: URL: https://github.com/apache/arrow/pull/7004#discussion_r412972235 ## File path: rust/arrow/src/array/union.rs ## @@ -0,0 +1,1172 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains the `UnionArray` and `UnionBuilder` types. +//! +//! Each slot in a `UnionArray` can have a value chosen from a number of types. Each of the +//! possible types are named like the fields of a [`StructArray`](crate::array::StructArray). +//! A `UnionArray` can have two possible memory layouts, "dense" or "sparse". For more information +//! on please see the [specification](https://arrow.apache.org/docs/format/Columnar.html#union-layout). +//! +//! Builders are provided for `UnionArray`'s involving primitive types. `UnionArray`'s of nested +//! types are also supported but not via `UnionBuilder`, see the tests for examples. +//! +//! # Example: Dense Memory Layout +//! +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_dense(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! 
builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 0_i32); +//! assert_eq!(union.value_offset(2), 1_i32); +//! +//! # Ok(()) +//! # } +//! ``` +//! +//! # Example: Sparse Memory Layout +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_sparse(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 1_i32); +//! assert_eq!(union.value_offset(2), 2_i32); +//! +//! # Ok(()) +//! # } +//! ``` +use crate::array::{ +builder::{builder_to_mutable_buffer, mutable_buffer_to_builder, BufferBuilderTrait}, +make_array, Array, ArrayData, ArrayDataBuilder, ArrayDataRef, ArrayRef, +BooleanBufferBuilder, BufferBuilder, Int32BufferBuilder, Int8BufferBuilder, +}; +use crate::buffer::{Buffer, MutableBuffer}; +use crate::datatypes::*; +use crate::error::{ArrowError, Result}; + +use crate::util::bit_util; +use core::fmt; +use std::any::Any; +use std::collections::HashMap; +use std::mem::size_of; + +/// An Array that can represent slots of varying types +pub struct UnionArray { +data: ArrayDataRef, +boxed_fields: Vec, +} + +impl UnionArray { +/// Creates a new `UnionArray`. +/// +/// Accepts type ids, child arrays and optionally offsets (for dense unions) to create +/// a new `UnionArray`. 
This method makes no attempt to validate the data provided by the +/// caller and assumes that each of the components are correct and consistent with each other. +/// See `try_new` for an alternative that validates the data provided. +/// +/// # Data Consistency +/// +/// The `type_ids` `Buffer` should contain `i8` values. These values should be greater than +/// zero and must be less than the number of children provided in `child_arrays`. These values Review comment: I was just going but [this](https://arrow.apache.org/docs/format/Columnar.html#union-layout). It's not immediately obvious why you would need that. I'll have to study the C++ implementation. I'll open a follow up PR. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure
[GitHub] [arrow] pitrou commented on issue #6997: ARROW-8540: [C++] Add memory allocation benchmarks
pitrou commented on issue #6997: URL: https://github.com/apache/arrow/pull/6997#issuecomment-617760416 > I tried with multiple threads and the numbers look very similar between SystemAlloc and Jemalloc when I would have expected Jemalloc to do better here. Well, it's a trivial micro-benchmark. But the glibc allocator isn't that bad anyway. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on a change in pull request #6997: ARROW-8540: [C++] Add memory allocation benchmarks
fsaintjacques commented on a change in pull request #6997: URL: https://github.com/apache/arrow/pull/6997#discussion_r412951352 ## File path: cpp/src/arrow/memory_pool_benchmark.cc ## @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/memory_pool.h" +#include "arrow/result.h" +#include "arrow/util/logging.h" + +#include "benchmark/benchmark.h" + +namespace arrow { + +struct SystemAlloc { + static Result GetAllocator() { return system_memory_pool(); } +}; + +#ifdef ARROW_JEMALLOC +struct Jemalloc { + static Result GetAllocator() { +MemoryPool* pool; +RETURN_NOT_OK(jemalloc_memory_pool(&pool)); +return pool; + } +}; +#endif + +#ifdef ARROW_MIMALLOC +struct Mimalloc { + static Result GetAllocator() { +MemoryPool* pool; +RETURN_NOT_OK(mimalloc_memory_pool(&pool)); +return pool; + } +}; +#endif + +static void TouchCacheLines(uint8_t* data, int64_t nbytes) { + uint8_t total = 0; + while (nbytes > 0) { +total += *data; +data += 64; +nbytes -= 64; + } + benchmark::DoNotOptimize(total); +} + +// Benchmark the raw cost of allocating memory. +// Note this is a best case situation: we always allocate and deallocate exactly +// the same size, without any other allocator traffic. 
However, it can be +// representative of workloads where we routinely create and destroy +// temporary buffers for intermediate computation results. +template +static void AllocateDeallocate(benchmark::State& state) { // NOLINT non-const reference + const int64_t nbytes = state.range(0); + MemoryPool* pool = *Alloc::GetAllocator(); + + for (auto _ : state) { +uint8_t* data; +ARROW_CHECK_OK(pool->Allocate(nbytes, &data)); +pool->Free(data, nbytes); + } +} + +// Benchmark the cost of allocating memory plus accessing it. +template +static void AllocateTouchDeallocate( +benchmark::State& state) { // NOLINT non-const reference + const int64_t nbytes = state.range(0); + MemoryPool* pool = *Alloc::GetAllocator(); + + for (auto _ : state) { +uint8_t* data; +ARROW_CHECK_OK(pool->Allocate(nbytes, &data)); +TouchCacheLines(data, nbytes); +pool->Free(data, nbytes); + } +} + +// Benchmark the cost of accessing always the same memory area. +static void TouchArea(benchmark::State& state) { // NOLINT non-const reference Review comment: I didn't understand at first, is this because you want to measure the difference between `AllocateDeallocate` and `AllocateTouchDeallocate` and this test gives you a rough number to validate said gap? If so, add a comment on top of it and move this as the first test to help reading. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] pitrou commented on issue #6981: PARQUET-1845: [C++] Add expected results of Int96 in big-endian
pitrou commented on issue #6981: URL: https://github.com/apache/arrow/pull/6981#issuecomment-617744347 As a side note, I think you may want to start with the Arrow unit tests before trying to make the Parquet unit tests successful. Parquet relies on many Arrow facilities. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #6992: ARROW-7950: [Python] Determine + test minimal pandas version + raise error when pandas is too old
pitrou commented on a change in pull request #6992: URL: https://github.com/apache/arrow/pull/6992#discussion_r412929769 ## File path: python/pyarrow/tests/test_pandas.py ## @@ -2685,8 +2685,8 @@ class A: 'a': pd.period_range('2000-01-01', periods=20), }) -expected_msg = 'Conversion failed for column a with type period' -with pytest.raises(TypeError, match=expected_msg): +expected_msg = 'Conversion failed for column a with type period|object' Review comment: Do you mean `(period|object)`? (parentheses included) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #6992: ARROW-7950: [Python] Determine + test minimal pandas version + raise error when pandas is too old
pitrou commented on a change in pull request #6992: URL: https://github.com/apache/arrow/pull/6992#discussion_r412929368 ## File path: python/pyarrow/pandas-shim.pxi ## @@ -55,6 +55,16 @@ cdef class _PandasAPIShim(object): from distutils.version import LooseVersion self._loose_version = LooseVersion(pd.__version__) +if self._loose_version < LooseVersion('0.23.0'): +self._have_pandas = False +if raise_: +raise ImportError( +"pyarrow requires pandas 0.23.0 or above, pandas {} is " +"installed".format(self._version) +) +else: +return Review comment: Could we perhaps emit a warning here? I don't think that users expect their Pandas installation to be silently ignored. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #7004: ARROW-3827: [Rust] Implement UnionArray Updated
pitrou commented on a change in pull request #7004: URL: https://github.com/apache/arrow/pull/7004#discussion_r412926608 ## File path: rust/arrow/src/array/union.rs ## @@ -0,0 +1,1172 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains the `UnionArray` and `UnionBuilder` types. +//! +//! Each slot in a `UnionArray` can have a value chosen from a number of types. Each of the +//! possible types are named like the fields of a [`StructArray`](crate::array::StructArray). +//! A `UnionArray` can have two possible memory layouts, "dense" or "sparse". For more information +//! on please see the [specification](https://arrow.apache.org/docs/format/Columnar.html#union-layout). +//! +//! Builders are provided for `UnionArray`'s involving primitive types. `UnionArray`'s of nested +//! types are also supported but not via `UnionBuilder`, see the tests for examples. +//! +//! # Example: Dense Memory Layout +//! +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_dense(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! 
builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 0_i32); +//! assert_eq!(union.value_offset(2), 1_i32); +//! +//! # Ok(()) +//! # } +//! ``` +//! +//! # Example: Sparse Memory Layout +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_sparse(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 1_i32); +//! assert_eq!(union.value_offset(2), 2_i32); +//! +//! # Ok(()) +//! # } +//! ``` +use crate::array::{ +builder::{builder_to_mutable_buffer, mutable_buffer_to_builder, BufferBuilderTrait}, +make_array, Array, ArrayData, ArrayDataBuilder, ArrayDataRef, ArrayRef, +BooleanBufferBuilder, BufferBuilder, Int32BufferBuilder, Int8BufferBuilder, +}; +use crate::buffer::{Buffer, MutableBuffer}; +use crate::datatypes::*; +use crate::error::{ArrowError, Result}; + +use crate::util::bit_util; +use core::fmt; +use std::any::Any; +use std::collections::HashMap; +use std::mem::size_of; + +/// An Array that can represent slots of varying types +pub struct UnionArray { +data: ArrayDataRef, +boxed_fields: Vec, +} + +impl UnionArray { +/// Creates a new `UnionArray`. +/// +/// Accepts type ids, child arrays and optionally offsets (for dense unions) to create +/// a new `UnionArray`. 
This method makes no attempt to validate the data provided by the +/// caller and assumes that each of the components are correct and consistent with each other. +/// See `try_new` for an alternative that validates the data provided. +/// +/// # Data Consistency +/// +/// The `type_ids` `Buffer` should contain `i8` values. These values should be greater than +/// zero and must be less than the number of children provided in `child_arrays`. These values +/// are used to index into the `child_arrays`. +/// +/// The `value_offsets` `Buffer` should contain `i32` values. These values should be greater Review comment: I don't know anything about Rust, but I don't see a constructor for sparse Union arrays. Sparse Union arrays don't have a offsets buffer. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comm
[GitHub] [arrow] pitrou commented on a change in pull request #7004: ARROW-3827: [Rust] Implement UnionArray Updated
pitrou commented on a change in pull request #7004: URL: https://github.com/apache/arrow/pull/7004#discussion_r412925276 ## File path: rust/arrow/src/array/union.rs ## @@ -0,0 +1,1172 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains the `UnionArray` and `UnionBuilder` types. +//! +//! Each slot in a `UnionArray` can have a value chosen from a number of types. Each of the +//! possible types are named like the fields of a [`StructArray`](crate::array::StructArray). +//! A `UnionArray` can have two possible memory layouts, "dense" or "sparse". For more information +//! on please see the [specification](https://arrow.apache.org/docs/format/Columnar.html#union-layout). +//! +//! Builders are provided for `UnionArray`'s involving primitive types. `UnionArray`'s of nested +//! types are also supported but not via `UnionBuilder`, see the tests for examples. +//! +//! # Example: Dense Memory Layout +//! +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_dense(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! 
builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 0_i32); +//! assert_eq!(union.value_offset(2), 1_i32); +//! +//! # Ok(()) +//! # } +//! ``` +//! +//! # Example: Sparse Memory Layout +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_sparse(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 1_i32); +//! assert_eq!(union.value_offset(2), 2_i32); +//! +//! # Ok(()) +//! # } +//! ``` +use crate::array::{ +builder::{builder_to_mutable_buffer, mutable_buffer_to_builder, BufferBuilderTrait}, +make_array, Array, ArrayData, ArrayDataBuilder, ArrayDataRef, ArrayRef, +BooleanBufferBuilder, BufferBuilder, Int32BufferBuilder, Int8BufferBuilder, +}; +use crate::buffer::{Buffer, MutableBuffer}; +use crate::datatypes::*; +use crate::error::{ArrowError, Result}; + +use crate::util::bit_util; +use core::fmt; +use std::any::Any; +use std::collections::HashMap; +use std::mem::size_of; + +/// An Array that can represent slots of varying types +pub struct UnionArray { +data: ArrayDataRef, +boxed_fields: Vec, +} + +impl UnionArray { +/// Creates a new `UnionArray`. +/// +/// Accepts type ids, child arrays and optionally offsets (for dense unions) to create +/// a new `UnionArray`. 
This method makes no attempt to validate the data provided by the +/// caller and assumes that each of the components are correct and consistent with each other. +/// See `try_new` for an alternative that validates the data provided. +/// +/// # Data Consistency +/// +/// The `type_ids` `Buffer` should contain `i8` values. These values should be greater than +/// zero and must be less than the number of children provided in `child_arrays`. These values Review comment: Note that the Union type in C++ and in the IPC format allows you to specify logical type ids. For example if creating `UnionType({float32, utf8}, type_ids=[5, 7])`, then a UnionArray's type ids buffer could be something like `[5, 7, 7]` for values such as `[1.5, "foo", "bar"]`. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queri
[GitHub] [arrow] pitrou commented on a change in pull request #7004: ARROW-3827: [Rust] Implement UnionArray Updated
pitrou commented on a change in pull request #7004: URL: https://github.com/apache/arrow/pull/7004#discussion_r412925276 ## File path: rust/arrow/src/array/union.rs ## @@ -0,0 +1,1172 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains the `UnionArray` and `UnionBuilder` types. +//! +//! Each slot in a `UnionArray` can have a value chosen from a number of types. Each of the +//! possible types are named like the fields of a [`StructArray`](crate::array::StructArray). +//! A `UnionArray` can have two possible memory layouts, "dense" or "sparse". For more information +//! on please see the [specification](https://arrow.apache.org/docs/format/Columnar.html#union-layout). +//! +//! Builders are provided for `UnionArray`'s involving primitive types. `UnionArray`'s of nested +//! types are also supported but not via `UnionBuilder`, see the tests for examples. +//! +//! # Example: Dense Memory Layout +//! +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_dense(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! 
builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 0_i32); +//! assert_eq!(union.value_offset(2), 1_i32); +//! +//! # Ok(()) +//! # } +//! ``` +//! +//! # Example: Sparse Memory Layout +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_sparse(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 1_i32); +//! assert_eq!(union.value_offset(2), 2_i32); +//! +//! # Ok(()) +//! # } +//! ``` +use crate::array::{ +builder::{builder_to_mutable_buffer, mutable_buffer_to_builder, BufferBuilderTrait}, +make_array, Array, ArrayData, ArrayDataBuilder, ArrayDataRef, ArrayRef, +BooleanBufferBuilder, BufferBuilder, Int32BufferBuilder, Int8BufferBuilder, +}; +use crate::buffer::{Buffer, MutableBuffer}; +use crate::datatypes::*; +use crate::error::{ArrowError, Result}; + +use crate::util::bit_util; +use core::fmt; +use std::any::Any; +use std::collections::HashMap; +use std::mem::size_of; + +/// An Array that can represent slots of varying types +pub struct UnionArray { +data: ArrayDataRef, +boxed_fields: Vec, +} + +impl UnionArray { +/// Creates a new `UnionArray`. +/// +/// Accepts type ids, child arrays and optionally offsets (for dense unions) to create +/// a new `UnionArray`. 
This method makes no attempt to validate the data provided by the +/// caller and assumes that each of the components are correct and consistent with each other. +/// See `try_new` for an alternative that validates the data provided. +/// +/// # Data Consistency +/// +/// The `type_ids` `Buffer` should contain `i8` values. These values should be greater than +/// zero and must be less than the number of children provided in `child_arrays`. These values Review comment: Note that the Union type in C++ and in the IPC format allows you to specify logical type ids. For example if creating `UnionType({float32, int32}, type_ids=[5, 7])`, then a UnionArray's type ids buffer could be something like `[5, 7, 7]`. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on issue #7010: [Rust] [CI]: fix rustfmt failures
github-actions[bot] commented on issue #7010: URL: https://github.com/apache/arrow/pull/7010#issuecomment-617680858 Thanks for opening a pull request! Could you open an issue for this pull request on JIRA? https://issues.apache.org/jira/browse/ARROW Then could you also rename pull request title in the following format? ARROW-${JIRA_ID}: [${COMPONENT}] ${SUMMARY} See also: * [Other pull requests](https://github.com/apache/arrow/pulls/) * [Contribution Guidelines - How to contribute patches](https://arrow.apache.org/docs/developers/contributing.html#how-to-contribute-patches) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] jianxind commented on a change in pull request #6954: ARROW-8440: [C++] Refine SIMD header files
jianxind commented on a change in pull request #6954: URL: https://github.com/apache/arrow/pull/6954#discussion_r412846457 ## File path: docs/source/developers/benchmarks.rst ## @@ -59,7 +59,7 @@ Sometimes, it is required to pass custom CMake flags, e.g. .. code-block:: shell export CC=clang-8 CXX=clang++8 - archery benchmark run --cmake-extras="-DARROW_USE_SIMD=ON" + archery benchmark run --cmake-extras="-DARROW_SIMD_LEVEL=NONE" Review comment: Get it, then ARROW_SIMD_LEVEL=NONE is fine:) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nevi-me commented on issue #7009: ARROW-8552: [Rust] support iterate parquet row columns
nevi-me commented on issue #7009: URL: https://github.com/apache/arrow/pull/7009#issuecomment-617678597 > looks like the windows CI is failing with error not related to my change: > > ``` > "error: \'rustfmt.exe\' is not installed for the toolchain \'nightly-2019-11-14-x86_64-pc-windows-msvc\'\nTo install > ``` This will be fixed by #7010. May you please kindly rebase once it's merged This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nevi-me opened a new pull request #7010: [Rust] [CI]: fix rustfmt failures
nevi-me opened a new pull request #7010: URL: https://github.com/apache/arrow/pull/7010 This adds the `rustfmt` component to the Rust installations in Windows and MacOS, and fixes `rustfmt` related CI failures. @kszucs @paddyhoran This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nevi-me commented on issue #7004: ARROW-3827: [Rust] Implement UnionArray Updated
nevi-me commented on issue #7004: URL: https://github.com/apache/arrow/pull/7004#issuecomment-617676072 > @kszucs it's failing due to `rustfmt` not being installed before testing the flight crate, any idea why this would be the case? Sorry, I don't know much about GitHub actions yet... This change fixes it (https://github.com/apache/arrow/pull/6306/commits/f0369b72e164edd86d8f9e6a69cf2cbf44bb6d18). I don't understand why the same toolchain has previously worked, but now we're getting spurious failures in different PRs. I'll open a separate PR to fix it This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] cyb70289 commented on a change in pull request #6954: ARROW-8440: [C++] Refine SIMD header files
cyb70289 commented on a change in pull request #6954: URL: https://github.com/apache/arrow/pull/6954#discussion_r412836834 ## File path: docs/source/developers/benchmarks.rst ## @@ -59,7 +59,7 @@ Sometimes, it is required to pass custom CMake flags, e.g. .. code-block:: shell export CC=clang-8 CXX=clang++8 - archery benchmark run --cmake-extras="-DARROW_USE_SIMD=ON" + archery benchmark run --cmake-extras="-DARROW_SIMD_LEVEL=NONE" Review comment: [The context](https://arrow.apache.org/docs/developers/benchmarks.html#running-the-benchmark-suite) is to show how to pass extra cmake flags. Default value looks not very useful. Maybe change to AVX2? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kou commented on issue #7008: ARROW-8551: [CI][Gandiva] Use LLVM 8 in gandiva linux build
kou commented on issue #7008: URL: https://github.com/apache/arrow/pull/7008#issuecomment-617618030 Can we move the Docker image for building Gandiva on Linux to our `ci/docker/` like https://github.com/apache/arrow/blob/master/python/manylinux201x/Dockerfile-x86_64_base_2014 ? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on issue #7008: ARROW-8551: [CI][Gandiva] Use LLVM 8 in gandiva linux build
github-actions[bot] commented on issue #7008: URL: https://github.com/apache/arrow/pull/7008#issuecomment-617616494 Revision: 1e235ddc11ff6ee4620b62e3b5f9a318d117512b Submitted crossbow builds: [ursa-labs/crossbow @ actions-162](https://github.com/ursa-labs/crossbow/branches/all?query=actions-162) |Task|Status| ||--| |gandiva-jar-xenial|[![TravisCI](https://img.shields.io/travis/ursa-labs/crossbow/actions-162-travis-gandiva-jar-xenial.svg)](https://travis-ci.org/ursa-labs/crossbow/branches)| This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kou commented on issue #7008: ARROW-8551: [CI][Gandiva] Use LLVM 8 in gandiva linux build
kou commented on issue #7008: URL: https://github.com/apache/arrow/pull/7008#issuecomment-617615843 @github-actions crossbow submit gandiva-jar-xenial This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] tustvold commented on a change in pull request #6980: ARROW-8516: [Rust] Improve PrimitiveBuilder::append_slice performance
tustvold commented on a change in pull request #6980: URL: https://github.com/apache/arrow/pull/6980#discussion_r412736972 ## File path: rust/arrow/src/array/builder.rs ## @@ -236,6 +251,14 @@ impl BufferBuilderTrait for BufferBuilder { self.write_bytes(v.to_byte_slice(), 1) } +default fn append_n(&mut self, n: usize, v: T::Native) -> Result<()> { +self.reserve(n)?; +for _ in 0..n { +self.write_bytes(v.to_byte_slice(), 1)?; +} Review comment: That would be incorrect though? The method should write the value `v` `n` times. Calling `write_bytes(v.to_byte_slice(), n)` would write `v` once and then increase the recorded element count by n, which would leave things in an odd state. For the general case there is little reason for this method to exist, aside from reserving memory once. It mainly exists because the bitmap specialization can more efficiently append the same value multiple times than a naive loop such as this. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] tustvold commented on a change in pull request #6980: ARROW-8516: [Rust] Improve PrimitiveBuilder::append_slice performance
tustvold commented on a change in pull request #6980: URL: https://github.com/apache/arrow/pull/6980#discussion_r412736972 ## File path: rust/arrow/src/array/builder.rs ## @@ -236,6 +251,14 @@ impl BufferBuilderTrait for BufferBuilder { self.write_bytes(v.to_byte_slice(), 1) } +default fn append_n(&mut self, n: usize, v: T::Native) -> Result<()> { +self.reserve(n)?; +for _ in 0..n { +self.write_bytes(v.to_byte_slice(), 1)?; +} Review comment: That would be incorrect though? The method should write the value `v` `n` times. Calling `write_bytes(v.to_byte_slice(), n)` would write `v` once and then increase the recorded element count by n, which would leave things in an odd state. For the general case there is little reason for this method to exist, aside from reserving memory once. It mainly exists because the bitmap specialization can more efficiently append the same value multiple times than a naive loop such as this. Edit: Checked again and you're completely correct, there is a type specialization for the buffer writer - will fix This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] tustvold commented on a change in pull request #6980: ARROW-8516: [Rust] Improve PrimitiveBuilder::append_slice performance
tustvold commented on a change in pull request #6980: URL: https://github.com/apache/arrow/pull/6980#discussion_r412736972 ## File path: rust/arrow/src/array/builder.rs ## @@ -236,6 +251,14 @@ impl BufferBuilderTrait for BufferBuilder { self.write_bytes(v.to_byte_slice(), 1) } +default fn append_n(&mut self, n: usize, v: T::Native) -> Result<()> { +self.reserve(n)?; +for _ in 0..n { +self.write_bytes(v.to_byte_slice(), 1)?; +} Review comment: That would be incorrect though? The method should write the value `v` `n` times. Calling `write_bytes(v.to_byte_slice(), n)` would write `v` once and then increase the recorded element count by n, which would leave things in an odd state. For the general case there is little reason for this method to exist, aside from reserving memory once. It mainly exists because the bitmap specialization can more efficiently append the same value multiple times than a naive loop such as this. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] tustvold commented on a change in pull request #6980: ARROW-8516: [Rust] Improve PrimitiveBuilder::append_slice performance
tustvold commented on a change in pull request #6980: URL: https://github.com/apache/arrow/pull/6980#discussion_r412736972 ## File path: rust/arrow/src/array/builder.rs ## @@ -236,6 +251,14 @@ impl BufferBuilderTrait for BufferBuilder { self.write_bytes(v.to_byte_slice(), 1) } +default fn append_n(&mut self, n: usize, v: T::Native) -> Result<()> { +self.reserve(n)?; +for _ in 0..n { +self.write_bytes(v.to_byte_slice(), 1)?; +} Review comment: That would be incorrect though? The method should write the value `v` `n` times. Calling `write_bytes(v.to_byte_slice(), n)` would write `v` once and then increase the recorded element count by n, which would leave things in an odd state. For the general case there is no reason for this method to exist, it purely exists because the bitmap specialization can more efficiently append the same value multiple times than a naive loop such as this. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] tustvold commented on a change in pull request #6980: ARROW-8516: [Rust] Improve PrimitiveBuilder::append_slice performance
tustvold commented on a change in pull request #6980: URL: https://github.com/apache/arrow/pull/6980#discussion_r412736972 ## File path: rust/arrow/src/array/builder.rs ## @@ -236,6 +251,14 @@ impl BufferBuilderTrait for BufferBuilder { self.write_bytes(v.to_byte_slice(), 1) } +default fn append_n(&mut self, n: usize, v: T::Native) -> Result<()> { +self.reserve(n)?; +for _ in 0..n { +self.write_bytes(v.to_byte_slice(), 1)?; +} Review comment: That would be incorrect though? The method should write the value `v` `n` times. Calling `write_bytes(v.to_byte_slice(), n)` would write `v` once and then update the recorded element count by n, which would leave things in an odd state. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] tustvold commented on a change in pull request #6980: ARROW-8516: [Rust] Improve PrimitiveBuilder::append_slice performance
tustvold commented on a change in pull request #6980: URL: https://github.com/apache/arrow/pull/6980#discussion_r412736972 ## File path: rust/arrow/src/array/builder.rs ## @@ -236,6 +251,14 @@ impl BufferBuilderTrait for BufferBuilder { self.write_bytes(v.to_byte_slice(), 1) } +default fn append_n(&mut self, n: usize, v: T::Native) -> Result<()> { +self.reserve(n)?; +for _ in 0..n { +self.write_bytes(v.to_byte_slice(), 1)?; +} Review comment: That would be incorrect though? The method should write the value `v` `n` times. Calling `write_bytes(v.to_byte_slice(), n)` would write `v` once and then update the array length by n, which would leave things in an odd state. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] houqp commented on issue #7009: ARROW-8552: [Rust] support iterate parquet row columns
houqp commented on issue #7009: URL: https://github.com/apache/arrow/pull/7009#issuecomment-617600824 looks like the windows CI is failing with error not related to my change: ``` "error: \'rustfmt.exe\' is not installed for the toolchain \'nightly-2019-11-14-x86_64-pc-windows-msvc\'\nTo install ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org