[GitHub] [arrow] kiszk commented on pull request #7101: ARROW-8695: [Java] Remove references to PlatformDependent in arrow-memory
kiszk commented on pull request #7101: URL: https://github.com/apache/arrow/pull/7101#issuecomment-623868378 Looks good to me This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7103: ARROW-8694: [C++][Parquet] Relax string size limit when deserializing Thrift messages
emkornfield commented on a change in pull request #7103: URL: https://github.com/apache/arrow/pull/7103#discussion_r419875325 ## File path: cpp/src/parquet/thrift_internal.h ## @@ -362,7 +362,7 @@ inline void DeserializeThriftUnencryptedMsg(const uint8_t* buf, uint32_t* len, new ThriftBuffer(const_cast(buf), *len)); apache::thrift::protocol::TCompactProtocolFactoryT tproto_factory; // Protect against CPU and memory bombs - tproto_factory.setStringSizeLimit(10 * 1000 * 1000); + tproto_factory.setStringSizeLimit(100 * 1000 * 1000); Review comment: could this be added easily to reader properties? If not 100 MB seems like a good amount of buffer, but in case people really want to abuse it, it would be nice not to get follow-up bug reports. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #6985: ARROW-8413: [C++][Parquet] Refactor Generating validity bitmap for values column
emkornfield commented on a change in pull request #6985: URL: https://github.com/apache/arrow/pull/6985#discussion_r419855848 ## File path: cpp/src/arrow/util/bit_util.h ## @@ -610,6 +618,71 @@ class FirstTimeBitmapWriter { } } + /// Appends number_of_bits from word to valid_bits and valid_bits_offset. + /// + /// \param[in] word The LSB bitmap to append. Any bits past number_of_bits are assumed + ///to be unset (i.e. 0). + /// \param[in] number_of_bits The number of bits to append from word. + void AppendWord(uint64_t word, int64_t number_of_bits) { +#if defined(ARROW_LITTLE_ENDIAN) +if (ARROW_PREDICT_FALSE(number_of_bits == 0)) { + return; +} + +// Location that the first byte needs to be written to. +uint8_t* append_position = bitmap_ + byte_offset_; + +// Update state variables except for current_byte_ here. +position_ += number_of_bits; +int64_t bit_offset = BitUtil::CountTrailingZeros(static_cast(bit_mask_)); +bit_mask_ = BitUtil::kBitmask[(bit_offset + number_of_bits) % 8]; +byte_offset_ += (bit_offset + number_of_bits) / 8; + +if (bit_offset != 0) { + // We are in the middle of the byte. This code updates the byte and shifts + // bits appropriately within word so it can be memcpy'd below. + int64_t bits_to_carry = 8 - bit_offset; + // Carry over bits from word to current_byte_. We assume any extra bits in word + // unset so no additional accounting is needed for when number_of_bits < + // bits_to_carry. + current_byte_ |= (word & BitUtil::kPrecedingBitmask[bits_to_carry]) << bit_offset; + // Check if everything is transfered into current_byte_. + if (ARROW_PREDICT_FALSE(number_of_bits < bits_to_carry)) { +return; + } + *append_position = current_byte_; + append_position++; + // Move the carry bits off of word. 
+ word = word >> bits_to_carry; + number_of_bits -= bits_to_carry; +} + +int64_t bytes_for_word = ::arrow::BitUtil::BytesForBits(number_of_bits); +std::memcpy(append_position, &word, bytes_for_word); +// At this point, the previous current_byte_ has been written to bitmap_. +// The new current_byte_ is either the last relevant byte in 'word' +// or cleared if the new position is byte aligned (i.e. a fresh byte). +if (bit_mask_ == 0x1) { + current_byte_ = 0; +} else { + // current_byte_ = *(reinterpret_cast<uint8_t*>(&word) + bytes_for_word - 1) : 0; + current_byte_ = *(append_position + bytes_for_word - 1); +} + +#else // big-endian +for (int x = 0; x < number_of_bits; x++) { Review comment: maybe I should just remove this since I can't verify? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #6985: ARROW-8413: [C++][Parquet] Refactor Generating validity bitmap for values column
emkornfield commented on a change in pull request #6985: URL: https://github.com/apache/arrow/pull/6985#discussion_r419855634 ## File path: cpp/src/arrow/util/bit_util.h ## @@ -610,6 +618,103 @@ class FirstTimeBitmapWriter { } } + /// Appends number_of_bits from word to valid_bits and valid_bits_offset. + /// + /// \param[in] word The LSB bitmap to append. Any bits past number_of_bits are assumed + ///to be unset (i.e. 0). + /// \param[in] number_of_bits The number of bits to append from word. + void AppendWord(uint64_t word, int64_t number_of_bits) { +#if defined(ARROW_LITTLE_ENDIAN) +// Selection masks to retrieve all low order bits for each byte in word. +constexpr uint64_t kLsbSelectionMasks[] = { +0, // unused. +0x0101010101010101, +0x0303030303030303, +0x0707070707070707, +0x0F0F0F0F0F0F0F0F, +0x1F1F1F1F1F1F1F1F, +0x3F3F3F3F3F3F3F3F, +0x7F7F7F7F7F7F7F7F, +}; +if (ARROW_PREDICT_FALSE(number_of_bits == 0)) { + return; +} + +// Location that the first byte needs to be written to. +uint8_t* append_position = bitmap_ + byte_offset_; + +// Update state variables except for current_byte_ here. +position_ += number_of_bits; +int64_t bit_offset = BitUtil::CountTrailingZeros(static_cast(bit_mask_)); +bit_mask_ = BitUtil::kBitmask[(bit_offset + number_of_bits) % 8]; +byte_offset_ += (bit_offset + number_of_bits) / 8; + +if (bit_offset != 0) { + // We are in the middle of the byte. This code updates the byte and shifts + // bits appropriately within word so it can be memcpy'd below. + int64_t bits_to_carry = 8 - bit_offset; + // Get the mask that will select the least significant bit (the ones to carry + // over to the current_byte_ and shift up). + const uint64_t carry_mask = kLsbSelectionMasks[bits_to_carry]; + // Mask to select non-carried bits. + const uint64_t non_carry_mask = ~carry_mask; + + // Carry over bits from word to current_byte_. We assume any extra bits in word + // unset so no additional accounting is needed for when number_of_bits < + // bits_to_carry. 
+ current_byte_ |= (((word & carry_mask) & 0xFF) << bit_offset); + // Check if everything is transfered into current_byte_. + if (ARROW_PREDICT_FALSE(number_of_bits < bits_to_carry)) { +return; + } + *append_position = current_byte_; + append_position++; + + // We illustrate the logic with a 3-byte example in little-endian/LSB order + // ('N' indicates a not set bit, 'Y' indicates a set bit). + // Note this ordering is reversed from HEX masks above with are expressed + // big-endian/MSB and shifts right move the bits to the left. + // The original bit positions are: + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 + // Assuming a bit-offset of 6 non_carry_mask is: + // N N Y Y Y Y Y Y N N Y Y Y Y Y YN N Y Y Y Y Y Y + // shifted_word is: + // 2 3 4 5 6 7 N N 10 11 12 13 14 15 N N 18 19 20 21 22 23 N N + uint64_t shifted_word = (word & non_carry_mask) >> bits_to_carry; + // captured_carry is: + // 0 1 N N N N N N 8 9 N N N N N N 16 17 N N N N N N + uint64_t captured_carry = carry_mask & word; + // mask_carry_bits is: + // N N N N N N 8 9 N N N N N N 16 17N N N N N N N N + uint64_t mask_carry_bits = (captured_carry >> 8) << bit_offset; Review comment: yes, I confused myself initially on LSB/little-endian and never recovered from that. removed a bunch of code from this. Thank you. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #6985: ARROW-8413: [C++][Parquet] Refactor Generating validity bitmap for values column
emkornfield commented on a change in pull request #6985: URL: https://github.com/apache/arrow/pull/6985#discussion_r419855248 ## File path: cpp/src/parquet/level_conversion.cc ## @@ -0,0 +1,170 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "parquet/level_conversion.h" + +#include +#include +#if defined(ARROW_HAVE_BMI2) +#include +#endif + +#include "arrow/util/bit_util.h" + +#include "parquet/exception.h" + +namespace parquet { +namespace internal { +namespace { +inline void CheckLevelRange(const int16_t* levels, int64_t num_levels, +const int16_t max_expected_level) { + int16_t min_level = std::numeric_limits::max(); + int16_t max_level = std::numeric_limits::min(); + for (int x = 0; x < num_levels; x++) { +min_level = std::min(levels[x], min_level); +max_level = std::max(levels[x], max_level); + } + if (ARROW_PREDICT_FALSE(num_levels > 0 && (min_level < 0 || max_level > max_level))) { +throw ParquetException("definition level exceeds maximum"); + } +} + +#if !defined(ARROW_HAVE_BMI2) + +inline void DefinitionLevelsToBitmapScalar( +const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, +const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, +uint8_t* valid_bits, int64_t valid_bits_offset) { + // We assume here that valid_bits is large enough to accommodate the + // additional definition levels and the ones that have already been written + ::arrow::internal::BitmapWriter valid_bits_writer(valid_bits, valid_bits_offset, +num_def_levels); + + // TODO(itaiin): As an interim solution we are splitting the code path here + // between repeated+flat column reads, and non-repeated+nested reads. + // Those paths need to be merged in the future Review comment: Yes it is. It is a behaviour issue. hopefully i'll be addressing this over the next few PRs. This was a first step. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #6985: ARROW-8413: [C++][Parquet] Refactor Generating validity bitmap for values column
emkornfield commented on a change in pull request #6985: URL: https://github.com/apache/arrow/pull/6985#discussion_r419855459 ## File path: cpp/src/arrow/util/bit_util.h ## @@ -610,6 +618,103 @@ class FirstTimeBitmapWriter { } } + /// Appends number_of_bits from word to valid_bits and valid_bits_offset. + /// + /// \param[in] word The LSB bitmap to append. Any bits past number_of_bits are assumed + ///to be unset (i.e. 0). + /// \param[in] number_of_bits The number of bits to append from word. + void AppendWord(uint64_t word, int64_t number_of_bits) { +#if defined(ARROW_LITTLE_ENDIAN) +// Selection masks to retrieve all low order bits for each byte in word. +constexpr uint64_t kLsbSelectionMasks[] = { +0, // unused. +0x0101010101010101, +0x0303030303030303, +0x0707070707070707, +0x0F0F0F0F0F0F0F0F, +0x1F1F1F1F1F1F1F1F, +0x3F3F3F3F3F3F3F3F, +0x7F7F7F7F7F7F7F7F, +}; +if (ARROW_PREDICT_FALSE(number_of_bits == 0)) { + return; +} + +// Location that the first byte needs to be written to. +uint8_t* append_position = bitmap_ + byte_offset_; + +// Update state variables except for current_byte_ here. +position_ += number_of_bits; +int64_t bit_offset = BitUtil::CountTrailingZeros(static_cast(bit_mask_)); +bit_mask_ = BitUtil::kBitmask[(bit_offset + number_of_bits) % 8]; +byte_offset_ += (bit_offset + number_of_bits) / 8; + +if (bit_offset != 0) { + // We are in the middle of the byte. This code updates the byte and shifts + // bits appropriately within word so it can be memcpy'd below. + int64_t bits_to_carry = 8 - bit_offset; + // Get the mask that will select the least significant bit (the ones to carry + // over to the current_byte_ and shift up). + const uint64_t carry_mask = kLsbSelectionMasks[bits_to_carry]; + // Mask to select non-carried bits. + const uint64_t non_carry_mask = ~carry_mask; + + // Carry over bits from word to current_byte_. We assume any extra bits in word + // unset so no additional accounting is needed for when number_of_bits < + // bits_to_carry. 
+ current_byte_ |= (((word & carry_mask) & 0xFF) << bit_offset); + // Check if everything is transfered into current_byte_. + if (ARROW_PREDICT_FALSE(number_of_bits < bits_to_carry)) { +return; + } + *append_position = current_byte_; + append_position++; + + // We illustrate the logic with a 3-byte example in little-endian/LSB order + // ('N' indicates a not set bit, 'Y' indicates a set bit). + // Note this ordering is reversed from HEX masks above with are expressed + // big-endian/MSB and shifts right move the bits to the left. + // The original bit positions are: + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 + // Assuming a bit-offset of 6 non_carry_mask is: + // N N Y Y Y Y Y Y N N Y Y Y Y Y YN N Y Y Y Y Y Y + // shifted_word is: + // 2 3 4 5 6 7 N N 10 11 12 13 14 15 N N 18 19 20 21 22 23 N N + uint64_t shifted_word = (word & non_carry_mask) >> bits_to_carry; + // captured_carry is: + // 0 1 N N N N N N 8 9 N N N N N N 16 17 N N N N N N + uint64_t captured_carry = carry_mask & word; + // mask_carry_bits is: + // N N N N N N 8 9 N N N N N N 16 17N N N N N N N N + uint64_t mask_carry_bits = (captured_carry >> 8) << bit_offset; + + word = shifted_word | mask_carry_bits; + number_of_bits -= bits_to_carry; +} + +int64_t bytes_for_word = ::arrow::BitUtil::BytesForBits(number_of_bits); +std::memcpy(append_position, , bytes_for_word); +// At this point, the previous current_byte_ has been written to bitmap_. +// The new current_byte_ is either the last relevant byte in 'word' +// or cleared if the new position is byte aligned (i.e. a fresh byte). +current_byte_ = +bit_mask_ != 0x1 ? *(reinterpret_cast() + bytes_for_word - 1) : 0; + +#else // big-endian +BitmapReader reader(reinterpret_cast(), 0, /*length=*/number_of_bits); Review comment: That does look more correct. Without big-endian in CI its hard to confirm. This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #6985: ARROW-8413: [C++][Parquet] Refactor Generating validity bitmap for values column
emkornfield commented on a change in pull request #6985: URL: https://github.com/apache/arrow/pull/6985#discussion_r419855100 ## File path: cpp/src/arrow/util/bit_util_test.cc ## @@ -315,6 +318,115 @@ TEST(FirstTimeBitmapWriter, NormalOperation) { } } +std::string BitmapToString(const uint8_t* bitmap, int64_t bit_count) { + return arrow::internal::Bitmap(bitmap, /*offset*/ 0, /*length=*/bit_count).ToString(); +} + +std::string BitmapToString(const std::vector& bitmap, int64_t bit_count) { + return BitmapToString(bitmap.data(), bit_count); +} + +TEST(FirstTimeBitmapWriter, AppendWordOffsetOverwritesCorrectBitsOnExistingByte) { + auto check_append = [](const std::string& expected_bits, int64_t offset) { +std::vector valid_bits = {0x00}; +constexpr int64_t kBitsAfterAppend = 8; +internal::FirstTimeBitmapWriter writer(valid_bits.data(), offset, + /*length=*/(8 * valid_bits.size()) - offset); +writer.AppendWord(/*word=*/0xFF, /*number_of_bits=*/kBitsAfterAppend - offset); +writer.Finish(); +EXPECT_EQ(BitmapToString(valid_bits, kBitsAfterAppend), expected_bits); + }; + check_append("", 0); + check_append("0111", 1); + check_append("0011", 2); + check_append("0001", 3); + check_append("", 4); + check_append("0111", 5); + check_append("0011", 6); + check_append("0001", 7); +} + +TEST(FirstTimeBitmapWriter, AppendZeroBitsHasNoImpact) { + std::vector valid_bits(/*count=*/1, 0); + internal::FirstTimeBitmapWriter writer(valid_bits.data(), /*start_offset=*/1, + /*length=*/valid_bits.size() * 8); + writer.AppendWord(/*word=*/0xFF, /*number_of_bits=*/0); + writer.AppendWord(/*word=*/0xFF, /*number_of_bits=*/0); + writer.AppendWord(/*word=*/0x01, /*number_of_bits=*/1); + writer.Finish(); + EXPECT_EQ(valid_bits[0], 0x2); +} + +TEST(FirstTimeBitmapWriter, AppendLessThenByte) { Review comment: fixed. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #6985: ARROW-8413: [C++][Parquet] Refactor Generating validity bitmap for values column
emkornfield commented on a change in pull request #6985: URL: https://github.com/apache/arrow/pull/6985#discussion_r419855029 ## File path: cpp/src/arrow/util/bit_util_test.cc ## @@ -315,6 +318,115 @@ TEST(FirstTimeBitmapWriter, NormalOperation) { } } +std::string BitmapToString(const uint8_t* bitmap, int64_t bit_count) { + return arrow::internal::Bitmap(bitmap, /*offset*/ 0, /*length=*/bit_count).ToString(); +} + +std::string BitmapToString(const std::vector& bitmap, int64_t bit_count) { + return BitmapToString(bitmap.data(), bit_count); +} + +TEST(FirstTimeBitmapWriter, AppendWordOffsetOverwritesCorrectBitsOnExistingByte) { + auto check_append = [](const std::string& expected_bits, int64_t offset) { +std::vector valid_bits = {0x00}; Review comment: I added variants to this test. Note that I don't think having a bitmap full of 111s and overwriting trailing ones is really expected, because any bits not processed are expected to be zero. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7089: ARROW-8657: [C++][Python] Add separate configuration for data pages
emkornfield commented on a change in pull request #7089: URL: https://github.com/apache/arrow/pull/7089#discussion_r419852845 ## File path: cpp/src/parquet/properties.h ## @@ -34,10 +34,14 @@ namespace parquet { +/// Control for data types in parquet. struct ParquetVersion { enum type { PARQUET_1_0, PARQUET_2_0 }; }; +/// Controls layout of data pages. Review comment: Added Wes's comment. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on pull request #6954: ARROW-8440: [C++] Refine SIMD header files
emkornfield commented on pull request #6954: URL: https://github.com/apache/arrow/pull/6954#issuecomment-623846486 Nothing more from me This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] jianxind commented on pull request #7029: ARROW-8579 [C++] Add AVX512 SIMD for spaced decoding and encoding.
jianxind commented on pull request #7029: URL: https://github.com/apache/arrow/pull/7029#issuecomment-623845147 > A general question: why is this limited to `sizeof(T) == 4` and `sizeof(T) == 8`? There are 8-bit and 16-bit types as well. _mm512_mask_expand_epi16/_mm512_mask_expand_epi8 are based on AVX512_VBMI2 support, which is a new feature starting from the Icelake architecture. So for 8-bit, current Skylake AVX512 needs to fall back to the SSE path. And does Arrow have 16-bit type data? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm edited a comment on pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl
wesm edited a comment on pull request #7088: URL: https://github.com/apache/arrow/pull/7088#issuecomment-623788547 Turns out Gandiva already had a `strptime` wrapper (that was ~10+x faster than vendored/datetime.h) so wasn't that hard to adapt it. This is ready to merge pending CI This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl
wesm commented on pull request #7088: URL: https://github.com/apache/arrow/pull/7088#issuecomment-623788547 Turns out Gandiva already had a `strptime` wrapper so wasn't that hard to adapt it. This is ready to merge pending CI This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on issue #7058: Offline installation on Linux
wesm commented on issue #7058: URL: https://github.com/apache/arrow/issues/7058#issuecomment-623774020 Closing. Please feel free to follow up on JIRA This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on a change in pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl
wesm commented on a change in pull request #7088: URL: https://github.com/apache/arrow/pull/7088#discussion_r419799490 ## File path: cpp/src/arrow/csv/converter_benchmark.cc ## @@ -20,15 +20,67 @@ #include #include +#include "arrow/buffer.h" #include "arrow/csv/converter.h" #include "arrow/csv/options.h" #include "arrow/csv/parser.h" +#include "arrow/csv/reader.h" #include "arrow/csv/test_common.h" +#include "arrow/io/memory.h" #include "arrow/testing/gtest_util.h" +#include "arrow/util/value_parsing.h" namespace arrow { namespace csv { +static std::shared_ptr GenerateTimestampsCSV( +const std::vector& dates, int32_t cols, int32_t rows) { + std::stringstream ss; + for (int32_t row = 0; row < rows; ++row) { +for (int32_t col = 0; col < cols; ++col) { + ss << dates[rand() % dates.size()]; +} +ss << "\n"; + } + return Buffer::FromString(ss.str()); +} + +static Status ReadCSV(const Buffer& data, const csv::ConvertOptions& convert_opt) { + ARROW_ASSIGN_OR_RAISE( + auto table_reader, + csv::TableReader::Make( + default_memory_pool(), std::make_shared(data), + csv::ReadOptions::Defaults(), csv::ParseOptions::Defaults(), convert_opt)); + return table_reader->Read().status(); +} + +const std::vector kExampleDates = { +"1917-10-17,", "2018-09-13 22,", "1941-06-22 04:00,", "1945-05-09 09:45:38,"}; +constexpr int32_t kNumRows = 1000; +constexpr int32_t kNumCols = 10; + +static void ConvertTimestampVirtualISO8601(benchmark::State& state) { + auto data = GenerateTimestampsCSV(kExampleDates, kNumCols, kNumRows); + auto convert_options = csv::ConvertOptions::Defaults(); + convert_options.timestamp_parsers.push_back(TimestampParser::MakeISO8601()); + for (auto _ : state) { +ABORT_NOT_OK(ReadCSV(*data, convert_options)); Review comment: sorry I missed this, done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on a change in pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl
wesm commented on a change in pull request #7088: URL: https://github.com/apache/arrow/pull/7088#discussion_r419792382 ## File path: cpp/src/arrow/util/value_parsing.cc ## @@ -79,5 +86,46 @@ bool StringToFloatConverter::StringToFloat(const char* s, size_t length, double* return true; } +// -- +// strptime-like parsing + +class StrptimeTimestampParser : public TimestampParser { + public: + explicit StrptimeTimestampParser(std::string format) : format_(std::move(format)) {} + + bool operator()(const char* s, size_t length, TimeUnit::type out_unit, + value_type* out) const override { +arrow_vendored::date::sys_seconds time_point; +std::istringstream stream(std::string(s, length)); +if (stream >> arrow_vendored::date::parse(format_, time_point)) { Review comment: This won't work anyway because HowardHinnant/date doesn't completely work with gcc < 7 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on pull request #7096: ARROW-8644: [Python] Restore ParquetDataset behaviour to always include partition column for dask compatibility
wesm commented on pull request #7096: URL: https://github.com/apache/arrow/pull/7096#issuecomment-623764101 This is a regression? If so can you mark it with 0.17.1 (and 1.0.0) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7103: ARROW-8694: [C++][Parquet] Relax string size limit when deserializing Thrift messages
github-actions[bot] commented on pull request #7103: URL: https://github.com/apache/arrow/pull/7103#issuecomment-623757231 https://issues.apache.org/jira/browse/ARROW-8694 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm opened a new pull request #7103: ARROW-8694: [C++][Parquet] Relax string size limit when deserializing Thrift messages
wesm opened a new pull request #7103: URL: https://github.com/apache/arrow/pull/7103 While it's not an ideal use case for Parquet, the 10MB limit for strings was causing a Thrift deserialization failure due to the large "pandas metadata" JSON blob written with the Schema when there are many columns. A 100MB limit should still catch "memory bombs" caused by nefarious input while allowing pretty wide data frames to be stored successfully This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kevinushey commented on pull request #7102: ARROW-8699: [R] Fix automatic r_to_py conversion
kevinushey commented on pull request #7102: URL: https://github.com/apache/arrow/pull/7102#issuecomment-623743441 LGTM! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on a change in pull request #6959: ARROW-5649: [Integration][C++] Create integration test for extension types
wesm commented on a change in pull request #6959: URL: https://github.com/apache/arrow/pull/6959#discussion_r419765783 ## File path: cpp/src/arrow/util/key_value_metadata.cc ## @@ -94,11 +94,42 @@ Result KeyValueMetadata::Get(const std::string& key) const { } } +Status KeyValueMetadata::Delete(int64_t index) { + keys_.erase(keys_.begin() + index); + values_.erase(values_.begin() + index); + return Status::OK(); +} + +Status KeyValueMetadata::DeleteMany(std::vector indices) { + std::sort(indices.begin(), indices.end()); + const int64_t size = static_cast(keys_.size()); + indices.push_back(size); + + int64_t shift = 0; + for (int64_t i = 0; i < static_cast(indices.size() - 1); ++i) { +++shift; +const auto start = indices[i] + 1; +const auto stop = indices[i + 1]; +DCHECK_GE(start, 0); +DCHECK_LE(start, size); +DCHECK_GE(stop, 0); +DCHECK_LE(stop, size); +for (int64_t index = start; index < stop; ++index) { + keys_[index - shift] = std::move(keys_[index]); + values_[index - shift] = std::move(values_[index]); +} + } + keys_.resize(size - shift); + values_.resize(size - shift); + return Status::OK(); +} + Status KeyValueMetadata::Delete(const std::string& key) { auto index = FindKey(key); if (index < 0) { return Status::KeyError(key); } else { +return Delete(index); keys_.erase(keys_.begin() + index); values_.erase(values_.begin() + index); return Status::OK(); Review comment: These lines will never execute ## File path: cpp/src/parquet/arrow/reader_internal.cc ## @@ -634,30 +639,23 @@ Status ApplyOriginalMetadata(std::shared_ptr field, const Field& origin_f field = field->WithType( ::arrow::dictionary(::arrow::int32(), field->type(), dict_origin_type.ordered())); } - // restore field metadata + + if (origin_type->id() == ::arrow::Type::EXTENSION) { +// Restore extension type, if the storage type is as read from Parquet +const auto& ex_type = checked_cast(*origin_type); +if (ex_type.storage_type()->Equals(*field->type())) { + field = field->WithType(origin_type); +} + 
} + + // Restore field metadata std::shared_ptr field_metadata = origin_field.metadata(); if (field_metadata != nullptr) { if (field->metadata()) { // Prefer the metadata keys (like field_id) from the current metadata field_metadata = field_metadata->Merge(*field->metadata()); } field = field->WithMetadata(field_metadata); - -// extension type -int name_index = field_metadata->FindKey(::arrow::kExtensionTypeKeyName); -if (name_index != -1) { - std::string type_name = field_metadata->value(name_index); - int data_index = field_metadata->FindKey(::arrow::kExtensionMetadataKeyName); - std::string type_data = data_index == -1 ? "" : field_metadata->value(data_index); - - std::shared_ptr<::arrow::ExtensionType> ext_type = - ::arrow::GetExtensionType(type_name); - if (ext_type != nullptr) { -ARROW_ASSIGN_OR_RAISE(auto deserialized, - ext_type->Deserialize(field->type(), type_data)); -field = field->WithType(deserialized); - } -} Review comment: Nice This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] yordan-pavlov commented on pull request #7037: ARROW-6718: [DRAFT] [Rust] Remove packed_simd
yordan-pavlov commented on pull request #7037: URL: https://github.com/apache/arrow/pull/7037#issuecomment-623718293 hi @nevi-me , I have just published the filtering benchmark here: https://github.com/yordan-pavlov/arrow-benchmark/blob/master/rust/arrow_benchmark/src/main.rs I hope it will be useful in benchmarking and improving arrow performance. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nealrichardson opened a new pull request #7102: ARROW-8699: [R] Fix automatic r_to_py conversion
nealrichardson opened a new pull request #7102: URL: https://github.com/apache/arrow/pull/7102 This appears to be the fix for https://github.com/rstudio/reticulate/issues/748 cc @kevinushey This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nealrichardson commented on a change in pull request #7021: ARROW-8628: [Dev] Wrap docker-compose commands with archery
nealrichardson commented on a change in pull request #7021: URL: https://github.com/apache/arrow/pull/7021#discussion_r419722389 ## File path: docs/source/developers/docker.rst ## @@ -0,0 +1,224 @@ +.. raw:: html + + + +Running Docker Builds += + +Most of our Linux based continuous integration tasks are decoupled from public +CI services using docker and docker-compose. Keeping the CI configuration +minimal makes local reproducibility possible. + +Usage +- + +There are multiple ways to execute the docker based builds. The recommended way is +to use the archery tool: + +Installation + + +``archery`` requires ``python>=3.5``. It is recommended to install archery in +``editable`` mode with the ``-e`` flag to automatically update the installation +by pulling the arrow repository. + +.. code:: bash + +pip install -e dev/archery[docker] + +For the available commands and options invoke the installed archery commands +with the ``--help`` flag: + +.. code:: bash + +archery docker --help +archery docker run --help + + +Examples + + +**List the available images:** + +.. code:: bash + +archery docker images + +**Execute a build:** + +.. code:: bash + +archery docker run conda-python + +Archery calls the following docker-compose commands: + +.. code:: bash + +docker-compose pull --ignore-pull-failures conda-cpp +docker-compose build conda-cpp +docker-compose pull --ignore-pull-failures conda-python +docker-compose build conda-python +docker-compose run --rm conda-python + +**Show the docker-compose commands instead of executing them:** + +.. code:: bash + +archery docker run --dry-run conda-python + +**To disable the image pulling:** + +.. code:: bash + +archery docker run --no-cache conda-python + +Which translates to: + +.. 
code:: bash + +docker-compose build --no-cache conda-cpp +docker-compose build --no-cache conda-python +docker-compose run --rm conda-python + +**To disable the cache only for the leaf image:** + +Useful to force building the development version of a dependency. +In case of the example below the command builds the +``conda-cpp > conda-python > conda-python-pandas`` branch of the image tree +where the leaf image is ``conda-python-pandas``. + +.. code:: bash + +PANDAS=master archery docker run --no-cache-leaf conda-python-pandas + +Which translates to: + +.. code:: bash + +export PANDAS=master +docker-compose pull --ignore-pull-failures conda-cpp +docker-compose build conda-cpp +docker-compose pull --ignore-pull-failures conda-python +docker-compose build conda-python +docker-compose build --no-cache conda-python-pandas +docker-compose run --rm conda-python-pandas + +Note that it doesn't pull the conda-python-pandas image and disable the cache +when building it. + +``PANDAS`` is a `build parameter `_, see the +defaults in the .env file. + +**To entirely skip building the image:** + +The layer caching mechanism of docker-compose is less reliable than docker's +depending on the version, ``cache_from`` build entry and the used backend +(docker-py, docker-cli, docker-cli and buildkit). This can lead to different +layer hashes - even when executing the same build command repeatedly - +eventually causing cache misses and full image rebuilds. + +If the image has already been built but the cache doesn't work properly, it can +be useful to skip the build phases: + +.. code:: bash + +archery docker run --no-build conda-python + +**Pass environment variables to the container:** + +Most of the build scripts used within the containers can be configured through +environment variables. Pass them using ``--env`` or ``-e`` CLI options - +similarly to ``docker run`` and ``docker-compose run`` interface. 
Review comment: ```suggestion similar to the ``docker run`` and ``docker-compose run`` interface. ``` ## File path: docs/source/developers/docker.rst ## @@ -0,0 +1,224 @@ +.. raw:: html + + + +Running Docker Builds += + +Most of our Linux based continuous integration tasks are decoupled from public +CI services using docker and docker-compose. Keeping the CI configuration +minimal makes local reproducibility possible. + +Usage +- + +There are multiple ways to execute the docker based builds. The recommended is +to use the archery tool: + +Installation + + +``archery`` requires ``python>=3.5``. It is recommended to install archery in +``editable`` mode with the ``-e`` flag to automatically update the intallation +by pulling the arrow repository. + +.. code:: bash + +pip install -e dev/archery[docker] + +For the available commands and options invoke the installed archery commands +with the ``--help`` flag: + +.. code:: bash + +archery docker --help +archery docker run --help + + +Examples + + +**List the available images:** + +.. code:: bash + +archery docker images +
[GitHub] [arrow] kszucs commented on pull request #7021: ARROW-8628: [Dev] Wrap docker-compose commands with archery
kszucs commented on pull request #7021: URL: https://github.com/apache/arrow/pull/7021#issuecomment-623695509 @github-actions crossbow submit -g test This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on a change in pull request #7021: ARROW-8628: [Dev] Wrap docker-compose commands with archery
kszucs commented on a change in pull request #7021: URL: https://github.com/apache/arrow/pull/7021#discussion_r419709631 ## File path: docs/source/developers/docker.rst ## @@ -0,0 +1,197 @@ +.. raw:: html + + + +Running Docker Builds += + +Most of our Linux based continuous integration tasks are decoupled from public +CI services using docker and docker-compose. Keeping the CI configuration +minimal makes local reproducibility possible. + +Usage +- + +There are multiple ways to execute the docker based builds. The recommended way is +to use the archery tool: + +Installation + + +Requires ``python>=3.5``. It is recommended to install archery in ``editable`` +mode to automatically update the installation by pulling the arrow repository. + +.. code:: bash + +pip install -e dev/archery[docker] + +Inspect the available commands and options +~~ + +.. code:: bash + +archery docker --help +archery docker run --help + +``archery docker run`` tries to provide a similar interface to +``docker-compose run``, but with builtin support for building hierarchical +images. + +Examples + + +**Execute a build:** + +.. code:: bash + +archery docker run conda-python + +Archery calls the following docker-compose commands: + +.. code:: bash + +docker-compose pull --ignore-pull-failures conda-cpp +docker-compose build conda-cpp +docker-compose pull --ignore-pull-failures conda-python +docker-compose build conda-python +docker-compose run --rm conda-python + +**To disable the image pulling:** + +.. code:: bash + +archery docker run --no-cache conda-python + +Which translates to: + +.. code:: bash + +docker-compose build --no-cache conda-cpp +docker-compose build --no-cache conda-python +docker-compose run --rm conda-python + +**To disable the cache only for the leaf image:** + +Useful to force building the development version of a dependency. +The leaf image is ``conda-python-pandas`` in the example. + +.. 
code:: bash + +PANDAS=master archery docker run --no-cache-leaf conda-python-pandas + +Which translates to: + +.. code:: bash + +export PANDAS=master +docker-compose pull --ignore-pull-failures conda-cpp +docker-compose build conda-cpp +docker-compose pull --ignore-pull-failures conda-python +docker-compose build conda-python +docker-compose build --no-cache conda-python-pandas +docker-compose run --rm conda-python-pandas + Review comment: Added. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on pull request #7021: ARROW-8628: [Dev] Wrap docker-compose commands with archery
kszucs commented on pull request #7021: URL: https://github.com/apache/arrow/pull/7021#issuecomment-623684032 > Some UX improvement I'd like to see: > > * I often fail to run `docker-compose` from the root of the sources leading to cryptic errors because it can't find the `.env`. Could you automatically fix this with a setcwd in archery? This is a nice UX feature for users who don't know this detail. `archery docker run` now works from any subdirectory of arrow, even when it is installed in non-editable mode. The script handles the reading of dotenv defaults for better integration, so we always override those values. > * It would be nice if upon failure, we'd get the list of environment variable changes affecting the docker build, e.g. PANDAS=master... printed somewhere such that we can easily reproduce. Added. > * Add an `archery docker list` command so we don't need to look at docker-compose.yml to get the list of available targets. Bonus point if you add a short description Added and the bonus is deferred to https://issues.apache.org/jira/browse/ARROW-8697 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7101: ARROW-8695: [Java] Remove references to PlatformDependent in arrow-memory
github-actions[bot] commented on pull request #7101: URL: https://github.com/apache/arrow/pull/7101#issuecomment-623682441 https://issues.apache.org/jira/browse/ARROW-8695 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7100: [Java] ARROW-8696: Convert tests to maven failsafe
github-actions[bot] commented on pull request #7100: URL: https://github.com/apache/arrow/pull/7100#issuecomment-623675865 Thanks for opening a pull request! Could you open an issue for this pull request on JIRA? https://issues.apache.org/jira/browse/ARROW Then could you also rename pull request title in the following format? ARROW-${JIRA_ID}: [${COMPONENT}] ${SUMMARY} See also: * [Other pull requests](https://github.com/apache/arrow/pulls/) * [Contribution Guidelines - How to contribute patches](https://arrow.apache.org/docs/developers/contributing.html#how-to-contribute-patches) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7101: [Java] ARROW-8695: Remove references to PlatformDependent in arrow-memory
github-actions[bot] commented on pull request #7101: URL: https://github.com/apache/arrow/pull/7101#issuecomment-623675866 Thanks for opening a pull request! Could you open an issue for this pull request on JIRA? https://issues.apache.org/jira/browse/ARROW Then could you also rename pull request title in the following format? ARROW-${JIRA_ID}: [${COMPONENT}] ${SUMMARY} See also: * [Other pull requests](https://github.com/apache/arrow/pulls/) * [Contribution Guidelines - How to contribute patches](https://arrow.apache.org/docs/developers/contributing.html#how-to-contribute-patches) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] rymurr opened a new pull request #7101: [Java] ARROW-8695: Remove references to PlatformDependent in arrow-memory
rymurr opened a new pull request #7101: URL: https://github.com/apache/arrow/pull/7101 As part of ARROW-8230 we are reducing the usages of Netty inside the arrow-memory module. This step simply removes Netty utils references This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] rymurr opened a new pull request #7100: [Java] ARROW-8696: Convert tests to maven failsafe
rymurr opened a new pull request #7100: URL: https://github.com/apache/arrow/pull/7100 Some tests are run via main() and can be run as integration tests instead. This makes running as part of an automated job easier. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm edited a comment on pull request #7098: ARROW-8692: [C++] Avoid memory copies when downloading from S3
wesm edited a comment on pull request #7098: URL: https://github.com/apache/arrow/pull/7098#issuecomment-623666418 S3 benchmarks run outside of EC2 aren't likely to be (too) useful This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on pull request #7098: ARROW-8692: [C++] Avoid memory copies when downloading from S3
wesm commented on pull request #7098: URL: https://github.com/apache/arrow/pull/7098#issuecomment-623666418 S3 benchmarks run outside of EC2 aren't likely to be useful This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on pull request #7098: ARROW-8692: [C++] Avoid memory copies when downloading from S3
fsaintjacques commented on pull request #7098: URL: https://github.com/apache/arrow/pull/7098#issuecomment-623657503 Locally: ``` # Before $ time cpp/build/conda-release/release/dataset-parquet-scan-example 's3://123:12345678@nyc-tlc/parquet?scheme=http_override=localhost:9000' Table size: 53747 real0m6.917s user0m49.500s sys 0m3.758s # After $ time cpp/build/conda-release/release/dataset-parquet-scan-example 's3://123:12345678@nyc-tlc/parquet?scheme=http_override=localhost:9000' Table size: 53747 real0m6.456s user0m45.141s sys 0m3.322s ``` A small improvement. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] lidavidm edited a comment on pull request #7098: ARROW-8692: [C++] Avoid memory copies when downloading from S3
lidavidm edited a comment on pull request #7098: URL: https://github.com/apache/arrow/pull/7098#issuecomment-623645720 Ok, I ran the benchmarks against S3 several times, but performance is wildly inconsistent. This is from an EC2 VM to S3 in the same region. Before: ``` -- Benchmark Time CPU Iterations -- MinioFixture/ReadAll500Mib/real_time 5850095808 ns 1881569619 ns 7 85.4687MB/s 0.170937 items/s MinioFixture/ReadChunked500Mib/real_time 7583846744 ns 1568950938 ns 6 65.9296MB/s 0.131859 items/s MinioFixture/ReadCoalesced500Mib/real_time 5935405783 ns 592848 ns 7 84.2402MB/s0.16848 items/s ``` After: ``` -- Benchmark Time CPU Iterations -- MinioFixture/ReadAll500Mib/real_time 10612223830 ns 2214309641 ns 6 47.1155MB/s 0.094231 items/s MinioFixture/ReadChunked500Mib/real_time 17048801064 ns 3879733068 ns 2 29.3276MB/s 0.0586552 items/s MinioFixture/ReadCoalesced500Mib/real_time 17039251080 ns 655276 ns 229.344MB/s 0.058688 items/s -- Benchmark Time CPU Iterations -- MinioFixture/ReadAll500Mib/real_time 5867569374 ns 1152395630 ns 4 85.2142MB/s 0.170428 items/s MinioFixture/ReadChunked500Mib/real_time 6496429473 ns 1172657713 ns 3 76.9654MB/s 0.153931 items/s MinioFixture/ReadCoalesced500Mib/real_time 4892376030 ns 575236 ns 4 102.2MB/s 0.2044 items/s ``` I think S3 performance is too variable/not high enough for this optimization to be noticeable, at least in this context :/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] lidavidm commented on pull request #7098: ARROW-8692: [C++] Avoid memory copies when downloading from S3
lidavidm commented on pull request #7098: URL: https://github.com/apache/arrow/pull/7098#issuecomment-623645720 Ok, I ran the benchmarks against S3 several times, but performance is wildly inconsistent. Before: ``` -- Benchmark Time CPU Iterations -- MinioFixture/ReadAll500Mib/real_time 5850095808 ns 1881569619 ns 7 85.4687MB/s 0.170937 items/s MinioFixture/ReadChunked500Mib/real_time 7583846744 ns 1568950938 ns 6 65.9296MB/s 0.131859 items/s MinioFixture/ReadCoalesced500Mib/real_time 5935405783 ns 592848 ns 7 84.2402MB/s0.16848 items/s ``` After: ``` -- Benchmark Time CPU Iterations -- MinioFixture/ReadAll500Mib/real_time 10612223830 ns 2214309641 ns 6 47.1155MB/s 0.094231 items/s MinioFixture/ReadChunked500Mib/real_time 17048801064 ns 3879733068 ns 2 29.3276MB/s 0.0586552 items/s MinioFixture/ReadCoalesced500Mib/real_time 17039251080 ns 655276 ns 229.344MB/s 0.058688 items/s -- Benchmark Time CPU Iterations -- MinioFixture/ReadAll500Mib/real_time 5867569374 ns 1152395630 ns 4 85.2142MB/s 0.170428 items/s MinioFixture/ReadChunked500Mib/real_time 6496429473 ns 1172657713 ns 3 76.9654MB/s 0.153931 items/s MinioFixture/ReadCoalesced500Mib/real_time 4892376030 ns 575236 ns 4 102.2MB/s 0.2044 items/s ``` I think S3 performance is too variable/not high enough for this optimization to be noticeable, at least in this context :/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] paddyhoran commented on a change in pull request #7004: ARROW-3827: [Rust] Implement UnionArray Updated
paddyhoran commented on a change in pull request #7004: URL: https://github.com/apache/arrow/pull/7004#discussion_r419658264 ## File path: rust/arrow/src/array/union.rs ## @@ -0,0 +1,1174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains the `UnionArray` and `UnionBuilder` types. +//! +//! Each slot in a `UnionArray` can have a value chosen from a number of types. Each of the +//! possible types are named like the fields of a [`StructArray`](crate::array::StructArray). +//! A `UnionArray` can have two possible memory layouts, "dense" or "sparse". For more information +//! on please see the [specification](https://arrow.apache.org/docs/format/Columnar.html#union-layout). +//! +//! Builders are provided for `UnionArray`'s involving primitive types. `UnionArray`'s of nested +//! types are also supported but not via `UnionBuilder`, see the tests for examples. +//! +//! # Example: Dense Memory Layout +//! +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_dense(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! 
builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 0_i32); +//! assert_eq!(union.value_offset(2), 1_i32); +//! +//! # Ok(()) +//! # } +//! ``` +//! +//! # Example: Sparse Memory Layout +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_sparse(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 1_i32); +//! assert_eq!(union.value_offset(2), 2_i32); +//! +//! # Ok(()) +//! # } +//! ``` +use crate::array::{ +builder::{builder_to_mutable_buffer, mutable_buffer_to_builder, BufferBuilderTrait}, +make_array, Array, ArrayData, ArrayDataBuilder, ArrayDataRef, ArrayRef, +BooleanBufferBuilder, BufferBuilder, Int32BufferBuilder, Int8BufferBuilder, +}; +use crate::buffer::{Buffer, MutableBuffer}; +use crate::datatypes::*; +use crate::error::{ArrowError, Result}; + +use crate::util::bit_util; +use core::fmt; +use std::any::Any; +use std::collections::HashMap; +use std::mem::size_of; + +/// An Array that can represent slots of varying types +pub struct UnionArray { +data: ArrayDataRef, +boxed_fields: Vec, +} + +impl UnionArray { +/// Creates a new `UnionArray`. +/// +/// Accepts type ids, child arrays and optionally offsets (for dense unions) to create +/// a new `UnionArray`. 
This method makes no attempt to validate the data provided by the +/// caller and assumes that each of the components are correct and consistent with each other. +/// See `try_new` for an alternative that validates the data provided. +/// +/// # Data Consistency +/// +/// The `type_ids` `Buffer` should contain `i8` values. These values should be greater than +/// zero and must be less than the number of children provided in `child_arrays`. These values +/// are used to index into the `child_arrays`. +/// +/// The `value_offsets` `Buffer` is only provided in the case of a dense union, sparse unions +/// should use `None`. If provided the `value_offsets` `Buffer` should contain `i32` values. +/// These values should be greater than zero and must be less than the length of the overall +/// array. +/// +/// In both cases above we use signed integer types to maintain compatibility with other +/// Arrow implementations. +/// +/// In both of the
[GitHub] [arrow] paddyhoran commented on a change in pull request #7004: ARROW-3827: [Rust] Implement UnionArray Updated
paddyhoran commented on a change in pull request #7004: URL: https://github.com/apache/arrow/pull/7004#discussion_r419657778 ## File path: rust/arrow/src/array/union.rs ## @@ -0,0 +1,1174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains the `UnionArray` and `UnionBuilder` types. +//! +//! Each slot in a `UnionArray` can have a value chosen from a number of types. Each of the +//! possible types are named like the fields of a [`StructArray`](crate::array::StructArray). +//! A `UnionArray` can have two possible memory layouts, "dense" or "sparse". For more information +//! on please see the [specification](https://arrow.apache.org/docs/format/Columnar.html#union-layout). +//! +//! Builders are provided for `UnionArray`'s involving primitive types. `UnionArray`'s of nested +//! types are also supported but not via `UnionBuilder`, see the tests for examples. +//! +//! # Example: Dense Memory Layout +//! +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_dense(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! 
builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 0_i32); +//! assert_eq!(union.value_offset(2), 1_i32); +//! +//! # Ok(()) +//! # } +//! ``` +//! +//! # Example: Sparse Memory Layout +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_sparse(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 1_i32); +//! assert_eq!(union.value_offset(2), 2_i32); +//! +//! # Ok(()) +//! # } +//! ``` +use crate::array::{ +builder::{builder_to_mutable_buffer, mutable_buffer_to_builder, BufferBuilderTrait}, +make_array, Array, ArrayData, ArrayDataBuilder, ArrayDataRef, ArrayRef, +BooleanBufferBuilder, BufferBuilder, Int32BufferBuilder, Int8BufferBuilder, +}; +use crate::buffer::{Buffer, MutableBuffer}; +use crate::datatypes::*; +use crate::error::{ArrowError, Result}; + +use crate::util::bit_util; +use core::fmt; +use std::any::Any; +use std::collections::HashMap; +use std::mem::size_of; + +/// An Array that can represent slots of varying types +pub struct UnionArray { +data: ArrayDataRef, +boxed_fields: Vec, +} + +impl UnionArray { +/// Creates a new `UnionArray`. +/// +/// Accepts type ids, child arrays and optionally offsets (for dense unions) to create +/// a new `UnionArray`. 
This method makes no attempt to validate the data provided by the +/// caller and assumes that each of the components are correct and consistent with each other. +/// See `try_new` for an alternative that validates the data provided. +/// +/// # Data Consistency +/// +/// The `type_ids` `Buffer` should contain `i8` values. These values should be greater than +/// zero and must be less than the number of children provided in `child_arrays`. These values +/// are used to index into the `child_arrays`. +/// +/// The `value_offsets` `Buffer` is only provided in the case of a dense union, sparse unions +/// should use `None`. If provided the `value_offsets` `Buffer` should contain `i32` values. +/// These values should be greater than zero and must be less than the length of the overall +/// array. +/// +/// In both cases above we use signed integer types to maintain compatibility with other +/// Arrow implementations. +/// +/// In both of the
[GitHub] [arrow] paddyhoran commented on a change in pull request #7004: ARROW-3827: [Rust] Implement UnionArray Updated
paddyhoran commented on a change in pull request #7004: URL: https://github.com/apache/arrow/pull/7004#discussion_r419653824 ## File path: rust/arrow/src/array/union.rs ## @@ -0,0 +1,1174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains the `UnionArray` and `UnionBuilder` types. +//! +//! Each slot in a `UnionArray` can have a value chosen from a number of types. Each of the +//! possible types are named like the fields of a [`StructArray`](crate::array::StructArray). +//! A `UnionArray` can have two possible memory layouts, "dense" or "sparse". For more information +//! on please see the [specification](https://arrow.apache.org/docs/format/Columnar.html#union-layout). +//! +//! Builders are provided for `UnionArray`'s involving primitive types. `UnionArray`'s of nested +//! types are also supported but not via `UnionBuilder`, see the tests for examples. +//! +//! # Example: Dense Memory Layout +//! +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_dense(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! 
builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 0_i32); +//! assert_eq!(union.value_offset(2), 1_i32); +//! +//! # Ok(()) +//! # } +//! ``` +//! +//! # Example: Sparse Memory Layout +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_sparse(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 1_i32); +//! assert_eq!(union.value_offset(2), 2_i32); +//! +//! # Ok(()) +//! # } +//! ``` +use crate::array::{ +builder::{builder_to_mutable_buffer, mutable_buffer_to_builder, BufferBuilderTrait}, +make_array, Array, ArrayData, ArrayDataBuilder, ArrayDataRef, ArrayRef, +BooleanBufferBuilder, BufferBuilder, Int32BufferBuilder, Int8BufferBuilder, +}; +use crate::buffer::{Buffer, MutableBuffer}; +use crate::datatypes::*; +use crate::error::{ArrowError, Result}; + +use crate::util::bit_util; +use core::fmt; +use std::any::Any; +use std::collections::HashMap; +use std::mem::size_of; + +/// An Array that can represent slots of varying types +pub struct UnionArray { +data: ArrayDataRef, +boxed_fields: Vec, +} + +impl UnionArray { +/// Creates a new `UnionArray`. +/// +/// Accepts type ids, child arrays and optionally offsets (for dense unions) to create +/// a new `UnionArray`. 
This method makes no attempt to validate the data provided by the +/// caller and assumes that each of the components are correct and consistent with each other. +/// See `try_new` for an alternative that validates the data provided. +/// +/// # Data Consistency +/// +/// The `type_ids` `Buffer` should contain `i8` values. These values should be greater than +/// zero and must be less than the number of children provided in `child_arrays`. These values +/// are used to index into the `child_arrays`. +/// +/// The `value_offsets` `Buffer` is only provided in the case of a dense union, sparse unions +/// should use `None`. If provided the `value_offsets` `Buffer` should contain `i32` values. +/// These values should be greater than zero and must be less than the length of the overall +/// array. +/// +/// In both cases above we use signed integer types to maintain compatibility with other +/// Arrow implementations. +/// +/// In both of the
[GitHub] [arrow] jorisvandenbossche edited a comment on pull request #7096: ARROW-8644: [Python] Restore ParquetDataset behaviour to always include partition column for dask compatibility
jorisvandenbossche edited a comment on pull request #7096: URL: https://github.com/apache/arrow/pull/7096#issuecomment-623635439 So the question comes up if we actually should have the same behaviour in case of `use_legacy_dataset=False` (the `_ParquetDatasetV2` shim). For me, that depends a bit on what we want to do long term with `ParquetDataset`. If we want to keep it as "the" ParquetDataset (maybe becoming a subclass of the actual Dataset class then), then I think it should have the "correct" behaviour. If we only see it as a temporary vehicle to get people to try it out / have people eventually use the `pyarrow.dataset` API, then it is less important to fix it. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] jorisvandenbossche edited a comment on pull request #7096: ARROW-8644: [Python] Restore ParquetDataset behaviour to always include partition column for dask compatibility
jorisvandenbossche edited a comment on pull request #7096: URL: https://github.com/apache/arrow/pull/7096#issuecomment-623635439 So the question comes up if we actually should also not revert the behaviour in case of `use_legacy_dataset=False` (the `_ParquetDatasetV2` shim). For me, that depends a bit on what we want to do long term with `ParquetDataset`. If we want to keep it as "the" ParquetDataset (maybe becoming a subclass of the actual Dataset class then), then I think it should have the "correct" behaviour. If we only see it as a temporary vehicle to get people to try it out / have people eventually use the pyarrow.dataset API, then it is less important. Joris Van den Bossche: Yep, will copy this comment over there This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on a change in pull request #7021: ARROW-8628: [Dev] Wrap docker-compose commands with archery
kszucs commented on a change in pull request #7021: URL: https://github.com/apache/arrow/pull/7021#discussion_r419646876 ## File path: docs/source/developers/docker.rst ## @@ -0,0 +1,197 @@ +.. raw:: html + + + +Running Docker Builds += + +Most of our Linux based continuous integration tasks are decoupled from public +CI services using docker and docker-compose. Keeping the CI configuration +minimal makes local reproducibility possible. + +Usage +- + +There are multiple ways to execute the docker based builds. The recommended is +to use the archery tool: + +Installation + + +Requires ``python>=3.5``. It is recommended to install archery in ``editable`` +mode to automatically update the intallation by pulling the arrow repository. + +.. code:: bash + +pip install -e dev/archery[docker] + +Inspect the available commands and options +~~ + +.. code:: bash + +archery docker --help +archery docker run --help + +``archery docker run`` tries to provide a similar interface to +``docker-compose run``, but with builtin support for building hierarchical +images. + +Examples + + +**Execute a build:** + +.. code:: bash + +archery docker run conda-python + +Archery calls the following docker-compose commands: + +.. code:: bash + +docker-compose pull --ignore-pull-failures conda-cpp +docker-compose build conda-cpp +docker-compose pull --ignore-pull-failures conda-python +docker-compose build conda-python +docker-compose run --rm conda-python + +**To disable the image pulling:** + +.. code:: bash + +archery docker run --no-cache conda-python + +Which translates to: + +.. code:: bash + +docker-compose build --no-cache conda-cpp +docker-compose build --no-cache conda-python +docker-compose run --rm conda-python + +**To disable the cache only for the leaf image:** + +Useful to force building the development version of a dependency. +The leaf image is ``conda-python-pandas`` in the example. + +.. 
code:: bash + +PANDAS=master archery docker run --no-cache-leaf conda-python-pandas + +Which translates to: + +.. code:: bash + +export PANDAS=master +docker-compose pull --ignore-pull-failures conda-cpp +docker-compose build conda-cpp +docker-compose pull --ignore-pull-failures conda-python +docker-compose build conda-python +docker-compose build --no-cache conda-python-pandas +docker-compose run --rm conda-python-pandas + +Note that it doesn't pull the conda-python-pandas image and disables the cache +when building it. + +``PANDAS`` is a `build parameter `_, see the +defaults in the .env file. + +**To entirely skip building the image:** Review comment: Added explanation. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on a change in pull request #7021: ARROW-8628: [Dev] Wrap docker-compose commands with archery
kszucs commented on a change in pull request #7021: URL: https://github.com/apache/arrow/pull/7021#discussion_r419646543 ## File path: docs/source/developers/integration.rst ## @@ -56,15 +56,19 @@ build mount is used for caching and sharing state between staged images. You can build and run a service by using the `build` and `run` docker-compose sub-command, e.g. `docker-compose build python && docker-compose run python`. We do not publish the build images, you need to build them manually. This -method requires the user to build the images in reverse dependency order. To -simplify this, we provide a Makefile. +method requires the user to build the images in reverse dependency order. .. code-block:: shell # Build and run manually - docker-compose build cpp - docker-compose build python - docker-compose run python + docker-compose build conda-cpp + docker-compose build conda-python + docker-compose run conda-python - # Using the makefile with proper image dependency resolution - make -f Makefile.docker python +To simplify this, Archery provides a command for it: + +.. code-block:: shell + + archery docker run conda-python + +See the `Running Docker Builds`_ for more details. Review comment: Updated. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] jorisvandenbossche commented on pull request #7096: ARROW-8644: [Python] Restore ParquetDataset behaviour to always include partition column for dask compatibility
jorisvandenbossche commented on pull request #7096: URL: https://github.com/apache/arrow/pull/7096#issuecomment-623635439 So the question comes up if we actually should also not revert the behaviour in case of `use_legacy_dataset=False` (the This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on a change in pull request #7021: ARROW-8628: [Dev] Wrap docker-compose commands with archery
kszucs commented on a change in pull request #7021: URL: https://github.com/apache/arrow/pull/7021#discussion_r419646070 ## File path: docs/source/developers/docker.rst ## @@ -0,0 +1,197 @@ +.. raw:: html + + + +Running Docker Builds += + +Most of our Linux based continuous integration tasks are decoupled from public +CI services using docker and docker-compose. Keeping the CI configuration +minimal makes local reproducibility possible. + +Usage +- + +There are multiple ways to execute the docker based builds. The recommended is +to use the archery tool: + +Installation + + +Requires ``python>=3.5``. It is recommended to install archery in ``editable`` +mode to automatically update the intallation by pulling the arrow repository. + +.. code:: bash + +pip install -e dev/archery[docker] + +Inspect the available commands and options +~~ + +.. code:: bash + +archery docker --help +archery docker run --help + +``archery docker run`` tries to provide a similar interface to +``docker-compose run``, but with builtin support for building hierarchical +images. + +Examples + + +**Execute a build:** + +.. code:: bash + +archery docker run conda-python + +Archery calls the following docker-compose commands: + +.. code:: bash + +docker-compose pull --ignore-pull-failures conda-cpp +docker-compose build conda-cpp +docker-compose pull --ignore-pull-failures conda-python +docker-compose build conda-python +docker-compose run --rm conda-python + +**To disable the image pulling:** + +.. code:: bash + +archery docker run --no-cache conda-python + +Which translates to: + +.. code:: bash + +docker-compose build --no-cache conda-cpp +docker-compose build --no-cache conda-python +docker-compose run --rm conda-python + +**To disable the cache only for the leaf image:** + +Useful to force building the development version of a dependency. +The leaf image is ``conda-python-pandas`` in the example. + +.. 
code:: bash + +PANDAS=master archery docker run --no-cache-leaf conda-python-pandas + +Which translates to: + +.. code:: bash + +export PANDAS=master +docker-compose pull --ignore-pull-failures conda-cpp +docker-compose build conda-cpp +docker-compose pull --ignore-pull-failures conda-python +docker-compose build conda-python +docker-compose build --no-cache conda-python-pandas +docker-compose run --rm conda-python-pandas + +Note that it doens't pull the conda-python-pandas image and disable the cache +when building it. + +``PANDAS`` is a `build parameter `_, see the +defaults in the .env file. + +**To entirely skip building the image:** + +.. code:: bash + +archery docker run --no-build conda-python + +In order to alter the runtime parameters pass docker environment variables to +the container during its execution: + +.. code:: bash + +archery docker run --env CMAKE_BUILD_TYPE=release ubuntu-cpp + +See the available C++ in the ``ci/scripts/cpp_build.sh`` script. Review comment: Updated. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on a change in pull request #7021: ARROW-8628: [Dev] Wrap docker-compose commands with archery
kszucs commented on a change in pull request #7021: URL: https://github.com/apache/arrow/pull/7021#discussion_r419646333 ## File path: docs/source/developers/docker.rst ## @@ -0,0 +1,197 @@ +.. raw:: html + + + +Running Docker Builds += + +Most of our Linux based continuous integration tasks are decoupled from public +CI services using docker and docker-compose. Keeping the CI configuration +minimal makes local reproducibility possible. + +Usage +- + +There are multiple ways to execute the docker based builds. The recommended is +to use the archery tool: + +Installation + + +Requires ``python>=3.5``. It is recommended to install archery in ``editable`` +mode to automatically update the intallation by pulling the arrow repository. + +.. code:: bash + +pip install -e dev/archery[docker] + +Inspect the available commands and options +~~ + +.. code:: bash + +archery docker --help +archery docker run --help + +``archery docker run`` tries to provide a similar interface to +``docker-compose run``, but with builtin support for building hierarchical +images. + +Examples + + +**Execute a build:** + +.. code:: bash + +archery docker run conda-python + +Archery calls the following docker-compose commands: + +.. code:: bash + +docker-compose pull --ignore-pull-failures conda-cpp +docker-compose build conda-cpp +docker-compose pull --ignore-pull-failures conda-python +docker-compose build conda-python +docker-compose run --rm conda-python + +**To disable the image pulling:** + +.. code:: bash + +archery docker run --no-cache conda-python + +Which translates to: + +.. code:: bash + +docker-compose build --no-cache conda-cpp +docker-compose build --no-cache conda-python +docker-compose run --rm conda-python + +**To disable the cache only for the leaf image:** + +Useful to force building the development version of a dependency. +The leaf image is ``conda-python-pandas`` in the example. + +.. 
code:: bash + +PANDAS=master archery docker run --no-cache-leaf conda-python-pandas + +Which translates to: + +.. code:: bash + +export PANDAS=master +docker-compose pull --ignore-pull-failures conda-cpp +docker-compose build conda-cpp +docker-compose pull --ignore-pull-failures conda-python +docker-compose build conda-python +docker-compose build --no-cache conda-python-pandas +docker-compose run --rm conda-python-pandas + +Note that it doens't pull the conda-python-pandas image and disable the cache +when building it. + +``PANDAS`` is a `build parameter `_, see the +defaults in the .env file. + +**To entirely skip building the image:** + +.. code:: bash + +archery docker run --no-build conda-python + +In order to alter the runtime parameters pass docker environment variables to +the container during its execution: + +.. code:: bash + +archery docker run --env CMAKE_BUILD_TYPE=release ubuntu-cpp + +See the available C++ in the ``ci/scripts/cpp_build.sh`` script. + +**Run the image with custom command:** + +Custom docker commands may be passed as the second argument. The following Review comment: Of the `archery docker run` command. ## File path: docs/source/developers/docker.rst ## @@ -0,0 +1,197 @@ +.. raw:: html + + + +Running Docker Builds += + +Most of our Linux based continuous integration tasks are decoupled from public +CI services using docker and docker-compose. Keeping the CI configuration +minimal makes local reproducibility possible. + +Usage +- + +There are multiple ways to execute the docker based builds. The recommended is +to use the archery tool: + +Installation + + +Requires ``python>=3.5``. It is recommended to install archery in ``editable`` +mode to automatically update the intallation by pulling the arrow repository. + +.. code:: bash + +pip install -e dev/archery[docker] + +Inspect the available commands and options +~~ + +.. 
code:: bash + +archery docker --help +archery docker run --help + +``archery docker run`` tries to provide a similar interface to +``docker-compose run``, but with builtin support for building hierarchical +images. + +Examples + + +**Execute a build:** + +.. code:: bash + +archery docker run conda-python + +Archery calls the following docker-compose commands: + +.. code:: bash + +docker-compose pull --ignore-pull-failures conda-cpp +docker-compose build conda-cpp +docker-compose pull --ignore-pull-failures conda-python +docker-compose build conda-python +docker-compose run --rm conda-python + +**To disable the image pulling:** + +.. code:: bash + +archery docker run --no-cache conda-python + +Which translates to: + +.. code:: bash + +docker-compose build --no-cache conda-cpp +docker-compose build --no-cache conda-python +
[GitHub] [arrow] pitrou commented on a change in pull request #6985: ARROW-8413: [C++][Parquet] Refactor Generating validity bitmap for values column
pitrou commented on a change in pull request #6985: URL: https://github.com/apache/arrow/pull/6985#discussion_r419602697 ## File path: cpp/src/arrow/util/bit_util.h ## @@ -610,6 +618,101 @@ class FirstTimeBitmapWriter { } } + /// Appends number_of_bits from word to valid_bits and valid_bits_offset. + /// + /// \param[in] word The LSB bitmap to append. Any bits past number_of_bits are assumed + ///to be zero. + /// \param[in] number_of_bits The number of bits to append from word. + void AppendWord(uint64_t word, int64_t number_of_bits) { +#if defined(ARROW_LITTLE_ENDIAN) +// Selection masks to retrieve all low order bits for each bytes. +constexpr uint64_t kLsbSelectionMasks[] = { +0, // unused. +0x0101010101010101, +0x0303030303030303, +0x0707070707070707, +0x0F0F0F0F0F0F0F0F, +0x1F1F1F1F1F1F1F1F, +0x3F3F3F3F3F3F3F3F, +0x7F7F7F7F7F7F7F7F, +}; +if (ARROW_PREDICT_FALSE(number_of_bits == 0)) { + return; +} + +// Location that the first byte needs to be written to. +uint8_t* append_position = bitmap_ + byte_offset_; + +// Update state variables except for current_byte_ here. +position_ += number_of_bits; +int64_t bit_offset = BitUtil::CountTrailingZeros(static_cast(bit_mask_)); +bit_mask_ = BitUtil::kBitmask[(bit_offset + number_of_bits) % 8]; +byte_offset_ += (bit_offset + number_of_bits) / 8; + +if (bit_offset != 0) { + // We are in the middle of the byte. This code updates the byte and shifts the word + // so trailing bits can be appended below. + int64_t bits_to_carry = 8 - bit_offset; + // Get the mask the will select the lower order bits (the ones to carry + // over to the existing byte and shift up. + const uint64_t carry_mask = kLsbSelectionMasks[bits_to_carry]; + // Mask to select non-carried bits. + const uint64_t non_carry_mask = ~carry_mask; + + // valid bits should be a valid bitmask so all trailing bytes hsould be unset + // so no mask is need to start. 
+ current_byte_ |= (((word & carry_mask) & 0xFF) << bit_offset); + if (ARROW_PREDICT_FALSE(number_of_bits < bits_to_carry)) { +return; + } + *append_position = current_byte_; + append_position++; + + // We illustrate logic with a 3-byte example in little endian/LSB order + // (N indicates not set bit Y indicates a set bit). + // Note this ordering is the reversed from HEX masks above with are expressed + // big-endian/MSB and shifts right move the bits to the left (division). + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 + // Assuming a bit-offset of 6 the non_carry_mask looks like: + // N N Y Y Y Y Y Y N N Y Y Y Y Y YN N Y Y Y Y Y Y + // So shifted_word should look like; + // 2 3 4 5 6 7 N N 10 11 12 13 14 15 N N 18 19 20 21 22 23 N N + // clang-format on + uint64_t shifted_word = (word & non_carry_mask) >> bits_to_carry; + // captured_carry: + // 0 1 N N N N N N 8 9 N N N N N N 16 17 N N N N N N + uint64_t captured_carry = carry_mask & word; + // mask_cary_bits: + // N N N N N N 8 9 N N N N N N 16 17N N N N N N N N + uint64_t mask_carry_bits = (captured_carry >> 8) << bit_offset; + + word = shifted_word | mask_carry_bits; + number_of_bits -= bits_to_carry; +} + +int64_t bytes_for_word = ::arrow::BitUtil::BytesForBits(number_of_bits); +std::memcpy(append_position, , bytes_for_word); +// At this point, we are guaranteed to have flushed the previous current_byte_ state. +// So the new state is either the last relevant byte in 'word' +// or zero if we happen to be at a fresh byte. +current_byte_ = +bit_mask_ != 0x1 ? *(reinterpret_cast() + bytes_for_word - 1) : 0; Review comment: This is cryptic. Can you rewrite it in a more understandable way? 
For example (untested): ```cpp if (bit_mask_ == 0x1) { current_byte_ = 0; } else { current_byte_ = *(append_position + bytes_for_word - 1); } ``` ## File path: cpp/src/arrow/util/bit_util_test.cc ## @@ -315,6 +318,115 @@ TEST(FirstTimeBitmapWriter, NormalOperation) { } } +std::string BitmapToString(const uint8_t* bitmap, int64_t bit_count) { + return arrow::internal::Bitmap(bitmap, /*offset*/ 0, /*length=*/bit_count).ToString(); +} + +std::string BitmapToString(const std::vector& bitmap, int64_t bit_count) { + return BitmapToString(bitmap.data(), bit_count); +} + +TEST(FirstTimeBitmapWriter, AppendWordOffsetOverwritesCorrectBitsOnExistingByte) { + auto check_append = [](const std::string& expected_bits, int64_t offset) { +std::vector valid_bits = {0x00}; Review comment: For each of the tests here it would be nice to
[GitHub] [arrow] pitrou commented on a change in pull request #6985: ARROW-8413: [C++][Parquet] Refactor Generating validity bitmap for values column
pitrou commented on a change in pull request #6985: URL: https://github.com/apache/arrow/pull/6985#discussion_r419593319 ## File path: cpp/src/arrow/util/bit_util.h ## @@ -43,13 +43,18 @@ #if defined(_MSC_VER) #include +#include #pragma intrinsic(_BitScanReverse) #pragma intrinsic(_BitScanForward) #define ARROW_BYTE_SWAP64 _byteswap_uint64 #define ARROW_BYTE_SWAP32 _byteswap_ulong +#define ARROW_POPCOUNT64 __popcnt64 +#define ARROW_POPCOUNT32 __popcnt #else #define ARROW_BYTE_SWAP64 __builtin_bswap64 #define ARROW_BYTE_SWAP32 __builtin_bswap32 +#define ARROW_POPCOUNT64 __builtin_popcountll +#define ARROW_POPCOUNT32 __builtin_popcount Review comment: I'd say 2027. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on pull request #7032: ARROW-6603: [C#] Adds ArrayBuilder API to support writing null values + BooleanArray null support
wesm commented on pull request #7032: URL: https://github.com/apache/arrow/pull/7032#issuecomment-623587071 @mr-smidge I don't think a CCLA is necessary for you to contribute. We've seen organizations require an ASF ICLA but extremely rarely a CCLA (in fact, many corporate attorneys will _not_ sign a CCLA). If your corporate counsel is unsure about this I would be happy to help This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on a change in pull request #7021: ARROW-8628: [Dev] Wrap docker-compose commands with archery
kszucs commented on a change in pull request #7021: URL: https://github.com/apache/arrow/pull/7021#discussion_r419541129 ## File path: docs/source/example1.dat ## @@ -0,0 +1 @@ +some data Review comment: Nope. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on a change in pull request #7021: ARROW-8628: [Dev] Wrap docker-compose commands with archery
kszucs commented on a change in pull request #7021: URL: https://github.com/apache/arrow/pull/7021#discussion_r419538949 ## File path: docs/source/developers/docker.rst ## @@ -0,0 +1,197 @@ +.. raw:: html + + + +Running Docker Builds += + +Most of our Linux based continuous integration tasks are decoupled from public +CI services using docker and docker-compose. Keeping the CI configuration +minimal makes local reproducibility possible. + +Usage +- + +There are multiple ways to execute the docker based builds. The recommended is +to use the archery tool: + +Installation + + +Requires ``python>=3.5``. It is recommended to install archery in ``editable`` +mode to automatically update the intallation by pulling the arrow repository. + +.. code:: bash + +pip install -e dev/archery[docker] + +Inspect the available commands and options +~~ + +.. code:: bash + +archery docker --help +archery docker run --help + +``archery docker run`` tries to provide a similar interface to Review comment: The command's interface tries to correspond to `docker-compose **run**` with the addition to build the required images. ```bash docker-compose run archery docker run docker-compose run --env VAR_A=A -e VAR_B=B archery docker run --env VAR_A=A -e VAR_B=B ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] removed a comment on pull request #7021: ARROW-8628: [Dev] Wrap docker-compose commands with archery
github-actions[bot] removed a comment on pull request #7021: URL: https://github.com/apache/arrow/pull/7021#issuecomment-618494095 Thanks for opening a pull request! Could you open an issue for this pull request on JIRA? https://issues.apache.org/jira/browse/ARROW Then could you also rename pull request title in the following format? ARROW-${JIRA_ID}: [${COMPONENT}] ${SUMMARY} See also: * [Other pull requests](https://github.com/apache/arrow/pulls/) * [Contribution Guidelines - How to contribute patches](https://arrow.apache.org/docs/developers/contributing.html#how-to-contribute-patches) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nealrichardson commented on a change in pull request #7021: ARROW-8628: [Dev] Wrap docker-compose commands with archery
nealrichardson commented on a change in pull request #7021: URL: https://github.com/apache/arrow/pull/7021#discussion_r419518977 ## File path: docs/source/developers/docker.rst ## @@ -0,0 +1,197 @@ +.. raw:: html + + + +Running Docker Builds += + +Most of our Linux based continuous integration tasks are decoupled from public +CI services using docker and docker-compose. Keeping the CI configuration +minimal makes local reproducibility possible. + +Usage +- + +There are multiple ways to execute the docker based builds. The recommended is +to use the archery tool: + +Installation + + +Requires ``python>=3.5``. It is recommended to install archery in ``editable`` +mode to automatically update the intallation by pulling the arrow repository. + +.. code:: bash + +pip install -e dev/archery[docker] + +Inspect the available commands and options +~~ + +.. code:: bash + +archery docker --help +archery docker run --help + +``archery docker run`` tries to provide a similar interface to Review comment: Does it though? `run` here seems to entail `build && run`. ## File path: docs/source/developers/docker.rst ## @@ -0,0 +1,143 @@ +.. raw:: html + + + +Running Docker Builds += + +Most of our Linux based continuous integration tasks are decoupled from public +CI services using docker and docker-compose. Keeping the CI configuration +minimal makes local reproducibility possible. + +Usage +- + +There are multiple ways to execute the docker based builds. The recommented is +to use the archery tool: Review comment: An immediate followup, right? I'm not sure that I agree with deferring but I'll allow it as long as it's the next thing you do. It looks like you already added some of the relevant information below, not sure how much more you need. ## File path: docs/source/developers/docker.rst ## @@ -0,0 +1,143 @@ +.. raw:: html + + + +Running Docker Builds += + +Most of our Linux based continuous integration tasks are decoupled from public +CI services using docker and docker-compose. 
Keeping the CI configuration +minimal makes local reproducibility possible. + +Usage +- + +There are multiple ways to execute the docker based builds. The recommented is +to use the archery tool: + +Installation: + +.. code:: bash + +pip install -e dev/archery[docker] + +Execute a build: + +.. code::bash + +archery docker run conda-python Review comment: Sounds like YAGNI to me, anyone else have an opinion? ## File path: docs/source/example1.dat ## @@ -0,0 +1 @@ +some data Review comment: Was this supposed to be checked in? ## File path: docs/source/developers/integration.rst ## @@ -56,15 +56,19 @@ build mount is used for caching and sharing state between staged images. You can build and run a service by using the `build` and `run` docker-compose sub-command, e.g. `docker-compose build python && docker-compose run python`. We do not publish the build images, you need to build them manually. This -method requires the user to build the images in reverse dependency order. To -simplify this, we provide a Makefile. +method requires the user to build the images in reverse dependency order. .. code-block:: shell # Build and run manually - docker-compose build cpp - docker-compose build python - docker-compose run python + docker-compose build conda-cpp + docker-compose build conda-python + docker-compose run conda-python - # Using the makefile with proper image dependency resolution - make -f Makefile.docker python +To simplify this, Archery provides a command for it: + +.. code-block:: shell + + archery docker run conda-python + +See the `Running Docker Builds`_ for more details. Review comment: ```suggestion See `Running Docker Builds`_ for more details. ``` ## File path: docs/source/developers/docker.rst ## @@ -0,0 +1,197 @@ +.. raw:: html + + + +Running Docker Builds += + +Most of our Linux based continuous integration tasks are decoupled from public +CI services using docker and docker-compose. 
Keeping the CI configuration +minimal makes local reproducibility possible. + +Usage +- + +There are multiple ways to execute the docker based builds. The recommended is +to use the archery tool: + +Installation + + +Requires ``python>=3.5``. It is recommended to install archery in ``editable`` +mode to automatically update the intallation by pulling the arrow repository. + +.. code:: bash + +pip install -e dev/archery[docker] + +Inspect the available commands and options +~~ + +.. code:: bash + +archery docker --help +archery docker run --help + +``archery docker run`` tries to provide a similar interface to +``docker-compose run``,
[GitHub] [arrow] kszucs commented on a change in pull request #7021: ARROW-8628: [Dev] Wrap docker-compose commands with archery
kszucs commented on a change in pull request #7021: URL: https://github.com/apache/arrow/pull/7021#discussion_r419528025 ## File path: .github/workflows/archery.yml ## @@ -51,10 +53,12 @@ jobs: python-version: '3.7' - name: Install working-directory: dev/archery -run: pip install pytest responses ruamel.yaml toolz jinja2 -e . +run: pip install pytest python-dotenv responses ruamel.yaml toolz jinja2 -e . Review comment: There is `-e .` at the end of the command. The additional dependencies listed here are required for testing and crossbow. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7099: ARROW-8693: [Python] Insert implicit cast in Dataset.get_fragments with filter
github-actions[bot] commented on pull request #7099: URL: https://github.com/apache/arrow/pull/7099#issuecomment-623534789 https://issues.apache.org/jira/browse/ARROW-8693 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] jorisvandenbossche opened a new pull request #7099: ARROW-8693: [Python] Insert implicit cast in Dataset.get_fragments with filter
jorisvandenbossche opened a new pull request #7099: URL: https://github.com/apache/arrow/pull/7099 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nealrichardson commented on pull request #6985: ARROW-8413: [C++][Parquet] Refactor Generating validity bitmap for values column
nealrichardson commented on pull request #6985: URL: https://github.com/apache/arrow/pull/6985#issuecomment-623520253 @emkornfield that looks like the same R Windows 32-bit failure to me. I'm not sure I understand your other question. Are you saying you want to use the old (status quo) code on 32-bit? That sounds like something you'd detect in cmake to me. If the C++ library needs to be built with different flags, the R Windows build script is in ci/scripts/PKGBUILD. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] pitrou commented on pull request #7098: ARROW-8692: [C++] Avoid memory copies when downloading from S3
pitrou commented on pull request #7098: URL: https://github.com/apache/arrow/pull/7098#issuecomment-623507548 @lidavidm It would be nice if you could run the benchmarks and post number on your setup (perhaps on S3 too?). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] pitrou edited a comment on pull request #7098: ARROW-8692: [C++] Avoid memory copies when downloading from S3
pitrou edited a comment on pull request #7098: URL: https://github.com/apache/arrow/pull/7098#issuecomment-623507548 @lidavidm It would be nice if you could run the benchmarks and post numbers on your setup (perhaps on S3 too?). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7098: ARROW-8692: [C++] Avoid memory copies when downloading from S3
github-actions[bot] commented on pull request #7098: URL: https://github.com/apache/arrow/pull/7098#issuecomment-623499876 https://issues.apache.org/jira/browse/ARROW-8692 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] pitrou opened a new pull request #7098: ARROW-8692: [C++] Avoid memory copies when downloading from S3
pitrou opened a new pull request #7098: URL: https://github.com/apache/arrow/pull/7098 The AWS SDK creates a auto-growing StringStream by default, entailing multiple memory copies when transferring large data blocks (because of resizes). Instead, write directly into the target data area. Low-level benchmarks with a local Minio server: * before: ``` - Benchmark Time CPU Iterations UserCounters... - MinioFixture/ReadAll500Mib/real_time434528630 ns431461370 ns 2 bytes_per_second=1.1237G/s items_per_second=2.30134/s MinioFixture/ReadChunked500Mib/real_time419380389 ns339293384 ns 2 bytes_per_second=1.16429G/s items_per_second=2.38447/s MinioFixture/ReadCoalesced500Mib/real_time 258812283 ns 470149 ns 3 bytes_per_second=1.88662G/s items_per_second=3.8638/s ``` * after: ``` MinioFixture/ReadAll500Mib/real_time194620947 ns161227337 ns 4 bytes_per_second=2.50888G/s items_per_second=5.13819/s MinioFixture/ReadChunked500Mib/real_time276437393 ns183030215 ns 3 bytes_per_second=1.76634G/s items_per_second=3.61746/s MinioFixture/ReadCoalesced500Mib/real_time 86693750 ns 448568 ns 6 bytes_per_second=5.63225G/s items_per_second=11.5349/s ``` Parquet read benchmarks from a local Minio server show speedups from 1.1x to 1.9x. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on a change in pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl
wesm commented on a change in pull request #7088: URL: https://github.com/apache/arrow/pull/7088#discussion_r419459324 ## File path: cpp/src/arrow/csv/converter.cc ## @@ -381,32 +383,98 @@ class NumericConverter : public ConcreteConverter { / // Concrete Converter for timestamps +namespace { + +struct InlineISO8601 { Review comment: That class needs to know the TimeUnit at construction time but it is not known yet when this class is constructed This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on a change in pull request #7089: ARROW-8657: [C++][Python] Add separate configuration for data pages
wesm commented on a change in pull request #7089: URL: https://github.com/apache/arrow/pull/7089#discussion_r419458260 ## File path: cpp/src/parquet/properties.h ## @@ -34,10 +34,14 @@ namespace parquet { +/// Control for data types in parquet. struct ParquetVersion { enum type { PARQUET_1_0, PARQUET_2_0 }; }; +/// Controls layout of data pages. Review comment: parquet-format v2.0.0 introduced a data page metadata and serialized page structure (for example, encoded levels are no longer compressed) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on pull request #7089: ARROW-8657: [C++][Python] Add separate configuration for data pages
wesm commented on pull request #7089: URL: https://github.com/apache/arrow/pull/7089#issuecomment-623480713 Sorry fat-fingered the review request. I will take a look at this This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] pitrou commented on pull request #7094: ARROW-8689: [C++] Fix linking S3FS benchmarks
pitrou commented on pull request #7094: URL: https://github.com/apache/arrow/pull/7094#issuecomment-623469326 Ok, rebasing. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7097: ARROW-8690: [Python] Clean-up dataset+parquet tests now order is deterministic
github-actions[bot] commented on pull request #7097: URL: https://github.com/apache/arrow/pull/7097#issuecomment-623457675 https://issues.apache.org/jira/browse/ARROW-8690 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on a change in pull request #7021: ARROW-8628: [Dev] Wrap docker-compose commands with archery
kszucs commented on a change in pull request #7021: URL: https://github.com/apache/arrow/pull/7021#discussion_r419416691 ## File path: docs/source/developers/docker.rst ## @@ -0,0 +1,143 @@ +.. raw:: html + + + +Running Docker Builds += + +Most of our Linux based continuous integration tasks are decoupled from public +CI services using docker and docker-compose. Keeping the CI configuration +minimal makes local reproducibility possible. + +Usage +- + +There are multiple ways to execute the docker based builds. The recommented is +to use the archery tool: + +Installation: + +.. code:: bash + +pip install -e dev/archery[docker] + +Execute a build: + +.. code::bash + +archery docker run conda-python + +Archery calls the following docker-compose commands: + +.. code::bash + +docker-compose pull --ignore-pull-failures conda-cpp +docker-compose build conda-cpp +docker-compose pull --ignore-pull-failures conda-python +docker-compose build conda-python +docker-compose run --rm conda-python + +To disable the image pulling: + +.. code::bash + +archery docker run --no-cache conda-python + +Which translates to: + +.. code::bash + +docker-compose build --no-cache conda-cpp +docker-compose build --no-cache conda-python +docker-compose run --rm conda-python + +To disable the cache only for the leaf image - useful to force building the +development version of a dependency: + +.. code::bash + +PANDAS=master archery docker run --no-cache-leaf conda-python + +``PANDAS`` is a build time parameter, see the defaults in the .env file. + +To entirely skip building the image: + +.. code::bash + +archery docker run --no-build conda-python + +In order to alter the runtime parameters pass environment variables to the +container in execution time: + +.. code::bash + +archery docker run -e CMAKE_BUILD_TYPE=release ubuntu-cpp Review comment: The ones defined in cpp_build.sh. Added a reference for it. This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on a change in pull request #7021: ARROW-8628: [Dev] Wrap docker-compose commands with archery
kszucs commented on a change in pull request #7021: URL: https://github.com/apache/arrow/pull/7021#discussion_r419413899 ## File path: docs/source/developers/docker.rst ## @@ -0,0 +1,143 @@ +.. raw:: html + + + +Running Docker Builds += + +Most of our Linux based continuous integration tasks are decoupled from public +CI services using docker and docker-compose. Keeping the CI configuration +minimal makes local reproducibility possible. + +Usage +- + +There are multiple ways to execute the docker based builds. The recommented is +to use the archery tool: + +Installation: + +.. code:: bash + +pip install -e dev/archery[docker] + +Execute a build: + +.. code::bash + +archery docker run conda-python + +Archery calls the following docker-compose commands: + +.. code::bash + +docker-compose pull --ignore-pull-failures conda-cpp +docker-compose build conda-cpp +docker-compose pull --ignore-pull-failures conda-python +docker-compose build conda-python +docker-compose run --rm conda-python + +To disable the image pulling: + +.. code::bash + +archery docker run --no-cache conda-python + +Which translates to: + +.. code::bash + +docker-compose build --no-cache conda-cpp +docker-compose build --no-cache conda-python +docker-compose run --rm conda-python + +To disable the cache only for the leaf image - useful to force building the +development version of a dependency: + +.. code::bash + +PANDAS=master archery docker run --no-cache-leaf conda-python + +``PANDAS`` is a build time parameter, see the defaults in the .env file. + +To entirely skip building the image: + +.. code::bash + +archery docker run --no-build conda-python + +In order to alter the runtime parameters pass environment variables to the +container in execution time: + +.. code::bash + +archery docker run -e CMAKE_BUILD_TYPE=release ubuntu-cpp + +Starting an interactive bash session for debugging: + +.. 
code::bash + +archery docker run ubuntu-cpp bash + +Development +--- + +The docker-compose configuration is tuned towards reusable development +containers using hierarchical images. For example multiple language bindings +are dependent on the C++ implementation, so instead of redefining the +C++ environment multiple Dockerfiles, we can reuse the exact same base C++ +image when building Glib, Ruby, R and Python bindings. +This helps reducing duplications and preventing a series of maintenance, but +makes the docker-compose configuration more complicated. + +Docker Build Parameters Review comment: Added. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on a change in pull request #7021: ARROW-8628: [Dev] Wrap docker-compose commands with archery
kszucs commented on a change in pull request #7021: URL: https://github.com/apache/arrow/pull/7021#discussion_r419413778 ## File path: docs/source/developers/docker.rst ## @@ -0,0 +1,143 @@ +.. raw:: html + + + +Running Docker Builds += + +Most of our Linux based continuous integration tasks are decoupled from public +CI services using docker and docker-compose. Keeping the CI configuration +minimal makes local reproducibility possible. + +Usage +- + +There are multiple ways to execute the docker based builds. The recommented is +to use the archery tool: + +Installation: + +.. code:: bash + +pip install -e dev/archery[docker] + +Execute a build: + +.. code::bash + +archery docker run conda-python + +Archery calls the following docker-compose commands: + +.. code::bash + +docker-compose pull --ignore-pull-failures conda-cpp +docker-compose build conda-cpp +docker-compose pull --ignore-pull-failures conda-python +docker-compose build conda-python +docker-compose run --rm conda-python + +To disable the image pulling: + +.. code::bash + +archery docker run --no-cache conda-python + +Which translates to: + +.. code::bash + +docker-compose build --no-cache conda-cpp +docker-compose build --no-cache conda-python +docker-compose run --rm conda-python + +To disable the cache only for the leaf image - useful to force building the +development version of a dependency: + +.. code::bash + +PANDAS=master archery docker run --no-cache-leaf conda-python + +``PANDAS`` is a build time parameter, see the defaults in the .env file. + +To entirely skip building the image: + +.. code::bash + +archery docker run --no-build conda-python + +In order to alter the runtime parameters pass environment variables to the +container in execution time: + +.. code::bash + +archery docker run -e CMAKE_BUILD_TYPE=release ubuntu-cpp + +Starting an interactive bash session for debugging: Review comment: I'm not sure what do you mean, updated to be a bit more verbose. 
This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7097: ARROW-8690: [Python] Clean-up dataset+parquet tests now order is deterministic
github-actions[bot] commented on pull request #7097: URL: https://github.com/apache/arrow/pull/7097#issuecomment-623445892 Revision: 065dc03fc971c34c7d008283ef399b88939f8e98 Submitted crossbow builds: [ursa-labs/crossbow @ actions-201](https://github.com/ursa-labs/crossbow/branches/all?query=actions-201) |Task|Status| ||--| |test-conda-python-3.6|[![Azure](https://dev.azure.com/ursa-labs/crossbow/_apis/build/status/ursa-labs.crossbow?branchName=actions-201-azure-test-conda-python-3.6)](https://dev.azure.com/ursa-labs/crossbow/_build/latest?definitionId=1=actions-201-azure-test-conda-python-3.6)| |test-conda-python-3.6-pandas-0.23|[![CircleCI](https://img.shields.io/circleci/build/github/ursa-labs/crossbow/actions-201-circle-test-conda-python-3.6-pandas-0.23.svg)](https://circleci.com/gh/ursa-labs/crossbow/tree/actions-201-circle-test-conda-python-3.6-pandas-0.23)| |test-conda-python-3.7|[![Azure](https://dev.azure.com/ursa-labs/crossbow/_apis/build/status/ursa-labs.crossbow?branchName=actions-201-azure-test-conda-python-3.7)](https://dev.azure.com/ursa-labs/crossbow/_build/latest?definitionId=1=actions-201-azure-test-conda-python-3.7)| |test-conda-python-3.7-dask-latest|[![CircleCI](https://img.shields.io/circleci/build/github/ursa-labs/crossbow/actions-201-circle-test-conda-python-3.7-dask-latest.svg)](https://circleci.com/gh/ursa-labs/crossbow/tree/actions-201-circle-test-conda-python-3.7-dask-latest)| |test-conda-python-3.7-hdfs-2.9.2|[![CircleCI](https://img.shields.io/circleci/build/github/ursa-labs/crossbow/actions-201-circle-test-conda-python-3.7-hdfs-2.9.2.svg)](https://circleci.com/gh/ursa-labs/crossbow/tree/actions-201-circle-test-conda-python-3.7-hdfs-2.9.2)| |test-conda-python-3.7-kartothek-latest|[![CircleCI](https://img.shields.io/circleci/build/github/ursa-labs/crossbow/actions-201-circle-test-conda-python-3.7-kartothek-latest.svg)](https://circleci.com/gh/ursa-labs/crossbow/tree/actions-201-circle-test-conda-python-3.7-kartothek-latest)| 
|test-conda-python-3.7-kartothek-master|[![CircleCI](https://img.shields.io/circleci/build/github/ursa-labs/crossbow/actions-201-circle-test-conda-python-3.7-kartothek-master.svg)](https://circleci.com/gh/ursa-labs/crossbow/tree/actions-201-circle-test-conda-python-3.7-kartothek-master)| |test-conda-python-3.7-pandas-latest|[![CircleCI](https://img.shields.io/circleci/build/github/ursa-labs/crossbow/actions-201-circle-test-conda-python-3.7-pandas-latest.svg)](https://circleci.com/gh/ursa-labs/crossbow/tree/actions-201-circle-test-conda-python-3.7-pandas-latest)| |test-conda-python-3.7-pandas-master|[![CircleCI](https://img.shields.io/circleci/build/github/ursa-labs/crossbow/actions-201-circle-test-conda-python-3.7-pandas-master.svg)](https://circleci.com/gh/ursa-labs/crossbow/tree/actions-201-circle-test-conda-python-3.7-pandas-master)| |test-conda-python-3.7-spark-master|[![CircleCI](https://img.shields.io/circleci/build/github/ursa-labs/crossbow/actions-201-circle-test-conda-python-3.7-spark-master.svg)](https://circleci.com/gh/ursa-labs/crossbow/tree/actions-201-circle-test-conda-python-3.7-spark-master)| |test-conda-python-3.7-turbodbc-latest|[![CircleCI](https://img.shields.io/circleci/build/github/ursa-labs/crossbow/actions-201-circle-test-conda-python-3.7-turbodbc-latest.svg)](https://circleci.com/gh/ursa-labs/crossbow/tree/actions-201-circle-test-conda-python-3.7-turbodbc-latest)| |test-conda-python-3.7-turbodbc-master|[![CircleCI](https://img.shields.io/circleci/build/github/ursa-labs/crossbow/actions-201-circle-test-conda-python-3.7-turbodbc-master.svg)](https://circleci.com/gh/ursa-labs/crossbow/tree/actions-201-circle-test-conda-python-3.7-turbodbc-master)| |test-conda-python-3.8|[![Azure](https://dev.azure.com/ursa-labs/crossbow/_apis/build/status/ursa-labs.crossbow?branchName=actions-201-azure-test-conda-python-3.8)](https://dev.azure.com/ursa-labs/crossbow/_build/latest?definitionId=1=actions-201-azure-test-conda-python-3.8)| 
|test-conda-python-3.8-dask-master|[![CircleCI](https://img.shields.io/circleci/build/github/ursa-labs/crossbow/actions-201-circle-test-conda-python-3.8-dask-master.svg)](https://circleci.com/gh/ursa-labs/crossbow/tree/actions-201-circle-test-conda-python-3.8-dask-master)| |test-conda-python-3.8-jpype|[![CircleCI](https://img.shields.io/circleci/build/github/ursa-labs/crossbow/actions-201-circle-test-conda-python-3.8-jpype.svg)](https://circleci.com/gh/ursa-labs/crossbow/tree/actions-201-circle-test-conda-python-3.8-jpype)| |test-conda-python-3.8-pandas-latest|[![CircleCI](https://img.shields.io/circleci/build/github/ursa-labs/crossbow/actions-201-circle-test-conda-python-3.8-pandas-latest.svg)](https://circleci.com/gh/ursa-labs/crossbow/tree/actions-201-circle-test-conda-python-3.8-pandas-latest)|
[GitHub] [arrow] lidavidm commented on a change in pull request #7012: ARROW-8555: [FlightRPC][Java] implement DoExchange
lidavidm commented on a change in pull request #7012: URL: https://github.com/apache/arrow/pull/7012#discussion_r419413303 ## File path: java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightClient.java ## @@ -293,6 +292,76 @@ public void onCompleted() { return stream; } + /** + * Initiate a bidirectional data exchange with the server. + * + * @param descriptor A descriptor for the data stream. + * @param options RPC call options. + * @return A pair of a readable stream and a writable stream. + */ + public ExchangeReaderWriter doExchange(FlightDescriptor descriptor, CallOption... options) { +Preconditions.checkNotNull(descriptor); +final io.grpc.CallOptions callOptions = CallOptions.wrapStub(asyncStub, options).getCallOptions(); + +try { + final ClientCall call = interceptedChannel.newCall(doExchangeDescriptor, callOptions); + final FlightStream stream = new FlightStream(allocator, PENDING_REQUESTS, call::cancel, call::request); + final ClientCallStreamObserver observer = (ClientCallStreamObserver) + ClientCalls.asyncBidiStreamingCall(call, stream.asObserver()); + final ClientStreamListener writer = new PutObserver( + descriptor, observer, stream.completed::isDone, + () -> { +try { + stream.completed.get(); +} catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw CallStatus.INTERNAL.withDescription("Client error: interrupted").withCause(e).toRuntimeException(); +} catch (ExecutionException e) { + throw CallStatus.INTERNAL.withDescription("Client error: " + e).withCause(e).toRuntimeException(); +} + }); + // Send the descriptor to start. + try (final ArrowMessage message = new ArrowMessage(descriptor.toProtocol())) { +observer.onNext(message); + } catch (Exception e) { +throw CallStatus.INTERNAL +.withCause(e) +.withDescription("Could not write descriptor message: " + e) Review comment: I've gone and fixed the other occurrences. This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] jorisvandenbossche commented on pull request #7097: ARROW-8690: [Python] Clean-up dataset+parquet tests now order is deterministic
jorisvandenbossche commented on pull request #7097: URL: https://github.com/apache/arrow/pull/7097#issuecomment-623445265 @github-actions crossbow submit -g python This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] lidavidm commented on a change in pull request #7012: ARROW-8555: [FlightRPC][Java] implement DoExchange
lidavidm commented on a change in pull request #7012: URL: https://github.com/apache/arrow/pull/7012#discussion_r419412323 ## File path: java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestDoExchange.java ## @@ -0,0 +1,395 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.flight; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Collections; +import java.util.stream.IntStream; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.util.AutoCloseables; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.VectorLoader; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.VectorUnloader; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; +import org.apache.arrow.vector.testing.ValueVectorDataPopulator; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import io.netty.buffer.ArrowBuf; + +public class TestDoExchange { + static byte[] EXCHANGE_DO_GET = "do-get".getBytes(StandardCharsets.UTF_8); + static byte[] EXCHANGE_DO_PUT = "do-put".getBytes(StandardCharsets.UTF_8); + static byte[] EXCHANGE_ECHO = "echo".getBytes(StandardCharsets.UTF_8); + static byte[] EXCHANGE_METADATA_ONLY = "only-metadata".getBytes(StandardCharsets.UTF_8); + static byte[] EXCHANGE_TRANSFORM = "transform".getBytes(StandardCharsets.UTF_8); + + private BufferAllocator allocator; + private FlightServer server; + private FlightClient client; + + @Before + public void setUp() throws Exception { +allocator = new RootAllocator(Integer.MAX_VALUE); +final Location serverLocation = Location.forGrpcInsecure(FlightTestUtil.LOCALHOST, 0); +server = FlightServer.builder(allocator, serverLocation, new Producer(allocator)).build(); 
+server.start(); +final Location clientLocation = Location.forGrpcInsecure(FlightTestUtil.LOCALHOST, server.getPort()); +client = FlightClient.builder(allocator, clientLocation).build(); + } + + @After + public void tearDown() throws Exception { +AutoCloseables.close(client, server, allocator); + } + + /** Test a pure-metadata flow. */ + @Test + public void testDoExchangeOnlyMetadata() throws Exception { +// Send a particular descriptor to the server and check for a particular response pattern. +try (final FlightClient.ExchangeReaderWriter stream = + client.doExchange(FlightDescriptor.command(EXCHANGE_METADATA_ONLY))) { + final FlightStream reader = stream.getReader(); + + // Server starts by sending a message without data (hence no VectorSchemaRoot should be present) + assertTrue(reader.next()); + assertFalse(reader.hasRoot()); + assertEquals(42, reader.getLatestMetadata().getInt(0)); + + // Write a metadata message to the server (without sending any data) + ArrowBuf buf = allocator.buffer(4); + buf.writeInt(84); + stream.getWriter().putMetadata(buf); + + // Check that the server echoed the metadata back to us + assertTrue(reader.next()); + assertFalse(reader.hasRoot()); + assertEquals(84, reader.getLatestMetadata().getInt(0)); + + // Close our write channel and ensure the server also closes theirs + stream.getWriter().completed(); + assertFalse(reader.next()); +} + } + + /** Emulate a DoGet with a DoExchange. */ + @Test + public void testDoExchangeDoGet() throws Exception { +try (final FlightClient.ExchangeReaderWriter stream = + client.doExchange(FlightDescriptor.command(EXCHANGE_DO_GET))) { + final FlightStream reader = stream.getReader(); + VectorSchemaRoot root = reader.getRoot(); + IntVector iv =
[GitHub] [arrow] kszucs commented on a change in pull request #7021: ARROW-8628: [Dev] Wrap docker-compose commands with archery
kszucs commented on a change in pull request #7021: URL: https://github.com/apache/arrow/pull/7021#discussion_r419401740 ## File path: docs/source/developers/docker.rst ## @@ -0,0 +1,143 @@ +.. raw:: html + + + +Running Docker Builds += + +Most of our Linux based continuous integration tasks are decoupled from public +CI services using docker and docker-compose. Keeping the CI configuration +minimal makes local reproducibility possible. + +Usage +- + +There are multiple ways to execute the docker based builds. The recommented is +to use the archery tool: + +Installation: + +.. code:: bash + +pip install -e dev/archery[docker] + +Execute a build: + +.. code::bash + +archery docker run conda-python + +Archery calls the following docker-compose commands: + +.. code::bash + +docker-compose pull --ignore-pull-failures conda-cpp +docker-compose build conda-cpp +docker-compose pull --ignore-pull-failures conda-python +docker-compose build conda-python +docker-compose run --rm conda-python + +To disable the image pulling: + +.. code::bash + +archery docker run --no-cache conda-python + +Which translates to: + +.. code::bash + +docker-compose build --no-cache conda-cpp +docker-compose build --no-cache conda-python +docker-compose run --rm conda-python + +To disable the cache only for the leaf image - useful to force building the +development version of a dependency: + +.. code::bash + +PANDAS=master archery docker run --no-cache-leaf conda-python + +``PANDAS`` is a build time parameter, see the defaults in the .env file. + +To entirely skip building the image: + +.. code::bash + +archery docker run --no-build conda-python + +In order to alter the runtime parameters pass environment variables to the +container in execution time: Review comment: Shorthand for `--env`. Using the full parameter name. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on a change in pull request #7021: ARROW-8628: [Dev] Wrap docker-compose commands with archery
kszucs commented on a change in pull request #7021: URL: https://github.com/apache/arrow/pull/7021#discussion_r419400510 ## File path: docs/source/developers/docker.rst ## @@ -0,0 +1,143 @@ +.. raw:: html + + + +Running Docker Builds += + +Most of our Linux based continuous integration tasks are decoupled from public +CI services using docker and docker-compose. Keeping the CI configuration +minimal makes local reproducibility possible. + +Usage +- + +There are multiple ways to execute the docker based builds. The recommented is +to use the archery tool: + +Installation: + +.. code:: bash + +pip install -e dev/archery[docker] + +Execute a build: + +.. code::bash + +archery docker run conda-python + +Archery calls the following docker-compose commands: + +.. code::bash + +docker-compose pull --ignore-pull-failures conda-cpp +docker-compose build conda-cpp +docker-compose pull --ignore-pull-failures conda-python +docker-compose build conda-python +docker-compose run --rm conda-python + +To disable the image pulling: + +.. code::bash + +archery docker run --no-cache conda-python + +Which translates to: + +.. code::bash + +docker-compose build --no-cache conda-cpp +docker-compose build --no-cache conda-python +docker-compose run --rm conda-python + +To disable the cache only for the leaf image - useful to force building the +development version of a dependency: + +.. code::bash + +PANDAS=master archery docker run --no-cache-leaf conda-python + +``PANDAS`` is a build time parameter, see the defaults in the .env file. Review comment: Described below, added a reference. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7096: ARROW-8644: [Python] Restore ParquetDataset behaviour to always include partition column for dask compatibility
github-actions[bot] commented on pull request #7096: URL: https://github.com/apache/arrow/pull/7096#issuecomment-623433630 Revision: 3e480a91833c7cd401fa120c520e5a51dad2d58a Submitted crossbow builds: [ursa-labs/crossbow @ actions-200](https://github.com/ursa-labs/crossbow/branches/all?query=actions-200) |Task|Status| ||--| |test-conda-python-3.7-dask-latest|[![CircleCI](https://img.shields.io/circleci/build/github/ursa-labs/crossbow/actions-200-circle-test-conda-python-3.7-dask-latest.svg)](https://circleci.com/gh/ursa-labs/crossbow/tree/actions-200-circle-test-conda-python-3.7-dask-latest)| |test-conda-python-3.8-dask-master|[![CircleCI](https://img.shields.io/circleci/build/github/ursa-labs/crossbow/actions-200-circle-test-conda-python-3.8-dask-master.svg)](https://circleci.com/gh/ursa-labs/crossbow/tree/actions-200-circle-test-conda-python-3.8-dask-master)| This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] pitrou commented on pull request #7094: ARROW-8689: [C++] Fix linking S3FS benchmarks
pitrou commented on pull request #7094: URL: https://github.com/apache/arrow/pull/7094#issuecomment-623432600 Java issues on CI look unrelated. @kszucs can you confirm? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7095: ARROW-8664: [Java] Add flag to skip null check
github-actions[bot] commented on pull request #7095: URL: https://github.com/apache/arrow/pull/7095#issuecomment-623428933 https://issues.apache.org/jira/browse/ARROW-8664 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7096: ARROW-8644: [Python] Restore ParquetDataset behaviour to always include partition column for dask compatibility
github-actions[bot] commented on pull request #7096: URL: https://github.com/apache/arrow/pull/7096#issuecomment-623428932 https://issues.apache.org/jira/browse/ARROW-8644 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] jorisvandenbossche opened a new pull request #7096: ARROW-8644: [Python] Restore ParquetDataset behaviour to always include partition column for dask compatibility
jorisvandenbossche opened a new pull request #7096: URL: https://github.com/apache/arrow/pull/7096 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] liyafan82 commented on pull request #6729: ARROW-8229: [Java] Move ArrowBuf into the Arrow package
liyafan82 commented on pull request #6729: URL: https://github.com/apache/arrow/pull/6729#issuecomment-623427960 > It will be good to link the related issues in the PR description. @siddharthteotia Thanks a lot for your effort. I have updated the description. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] rymurr opened a new pull request #7095: ARROW-8664: [Java] Add flag to skip null check
rymurr opened a new pull request #7095: URL: https://github.com/apache/arrow/pull/7095 All Vector containers should skip null check when null check flag is enabled This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] rymurr commented on pull request #7093: ARROW-8687: [Java] Remove references to io.netty.buffer.ArrowBuf
rymurr commented on pull request #7093: URL: https://github.com/apache/arrow/pull/7093#issuecomment-623426298 Thanks both! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7094: ARROW-8689: [C++] Fix linking S3FS benchmarks
github-actions[bot] commented on pull request #7094: URL: https://github.com/apache/arrow/pull/7094#issuecomment-623422812 https://issues.apache.org/jira/browse/ARROW-8689 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] pitrou opened a new pull request #7094: ARROW-8689: [C++] Fix linking S3FS benchmarks
pitrou opened a new pull request #7094: URL: https://github.com/apache/arrow/pull/7094 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on a change in pull request #7021: ARROW-8628: [Dev] Wrap docker-compose commands with archery
kszucs commented on a change in pull request #7021: URL: https://github.com/apache/arrow/pull/7021#discussion_r419372644 ## File path: docs/source/developers/docker.rst ## @@ -0,0 +1,143 @@ +.. raw:: html + + + +Running Docker Builds += + +Most of our Linux based continuous integration tasks are decoupled from public +CI services using docker and docker-compose. Keeping the CI configuration +minimal makes local reproducibility possible. + +Usage +- + +There are multiple ways to execute the docker based builds. The recommended way is +to use the archery tool: + +Installation: + +.. code:: bash + +pip install -e dev/archery[docker] + +Execute a build: + +.. code::bash + +archery docker run conda-python + +Archery calls the following docker-compose commands: + +.. code::bash + +docker-compose pull --ignore-pull-failures conda-cpp +docker-compose build conda-cpp +docker-compose pull --ignore-pull-failures conda-python +docker-compose build conda-python +docker-compose run --rm conda-python + +To disable the image pulling: Review comment: Yes. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on a change in pull request #7021: ARROW-8628: [Dev] Wrap docker-compose commands with archery
kszucs commented on a change in pull request #7021: URL: https://github.com/apache/arrow/pull/7021#discussion_r419353296 ## File path: docs/source/developers/docker.rst ## @@ -0,0 +1,143 @@ +.. raw:: html + + + +Running Docker Builds += + +Most of our Linux based continuous integration tasks are decoupled from public +CI services using docker and docker-compose. Keeping the CI configuration +minimal makes local reproducibility possible. + +Usage +- + +There are multiple ways to execute the docker based builds. The recommended way is +to use the archery tool: + +Installation: + +.. code:: bash + +pip install -e dev/archery[docker] + +Execute a build: + +.. code::bash + +archery docker run conda-python Review comment: I kept that open for features like listing available build(er)s. I can change this to something like `archery build conda-python` if you find it more convenient. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] pitrou commented on pull request #6959: ARROW-5649: [Integration][C++] Create integration test for extension types
pitrou commented on pull request #6959: URL: https://github.com/apache/arrow/pull/6959#issuecomment-623381960 @wesm Do you want to take a look at this? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl
pitrou commented on a change in pull request #7088: URL: https://github.com/apache/arrow/pull/7088#discussion_r419336015 ## File path: cpp/src/arrow/util/value_parsing.cc ## @@ -79,5 +86,46 @@ bool StringToFloatConverter::StringToFloat(const char* s, size_t length, double* return true; } +// -- +// strptime-like parsing + +class StrptimeTimestampParser : public TimestampParser { + public: + explicit StrptimeTimestampParser(std::string format) : format_(std::move(format)) {} + + bool operator()(const char* s, size_t length, TimeUnit::type out_unit, + value_type* out) const override { +arrow_vendored::date::sys_seconds time_point; +std::istringstream stream(std::string(s, length)); +if (stream >> arrow_vendored::date::parse(format_, time_point)) { Review comment: Shouldn't you also check that the stream is exhausted? Otherwise trailing data will be ignored. ## File path: cpp/src/arrow/util/value_parsing.h ## @@ -356,177 +377,194 @@ class StringConverter : public StringToSignedIntConverterMixin::StringToSignedIntConverterMixin; }; -template <> -class StringConverter { - public: - using value_type = TimestampType::c_type; +// Inline-able ISO-8601 parser - explicit StringConverter(const std::shared_ptr& type) - : unit_(checked_cast(type.get())->unit()) {} +namespace detail { - bool operator()(const char* s, size_t length, value_type* out) { -// We allow the following formats: -// - "-MM-DD" -// - "-MM-DD[ T]hh" -// - "-MM-DD[ T]hhZ" -// - "-MM-DD[ T]hh:mm" -// - "-MM-DD[ T]hh:mmZ" -// - "-MM-DD[ T]hh:mm:ss" -// - "-MM-DD[ T]hh:mm:ssZ" -// UTC is always assumed, and the DataType's timezone is ignored. 
-arrow_vendored::date::year_month_day ymd; -if (ARROW_PREDICT_FALSE(length < 10)) { - return false; -} -if (length == 10) { - if (ARROW_PREDICT_FALSE(!Parse_MM_DD(s, ))) { -return false; - } - return ConvertTimePoint(arrow_vendored::date::sys_days(ymd), out); -} -if (ARROW_PREDICT_FALSE(s[10] != ' ') && ARROW_PREDICT_FALSE(s[10] != 'T')) { - return false; -} -if (s[length - 1] == 'Z') { - --length; -} -if (length == 13) { - if (ARROW_PREDICT_FALSE(!Parse_MM_DD(s, ))) { -return false; - } - std::chrono::duration seconds; - if (ARROW_PREDICT_FALSE(!ParseHH(s + 11, ))) { -return false; - } - return ConvertTimePoint(arrow_vendored::date::sys_days(ymd) + seconds, out); -} -if (length == 16) { - if (ARROW_PREDICT_FALSE(!Parse_MM_DD(s, ))) { -return false; - } - std::chrono::duration seconds; - if (ARROW_PREDICT_FALSE(!ParseHH_MM(s + 11, ))) { -return false; - } - return ConvertTimePoint(arrow_vendored::date::sys_days(ymd) + seconds, out); -} -if (length == 19) { - if (ARROW_PREDICT_FALSE(!Parse_MM_DD(s, ))) { -return false; - } - std::chrono::duration seconds; - if (ARROW_PREDICT_FALSE(!ParseHH_MM_SS(s + 11, ))) { -return false; - } - return ConvertTimePoint(arrow_vendored::date::sys_days(ymd) + seconds, out); -} +using ts_type = TimestampType::c_type; + +template +static inline ts_type ConvertTimePoint(TimePoint tp, TimeUnit::type unit) { + auto duration = tp.time_since_epoch(); + switch (unit) { +case TimeUnit::SECOND: + return std::chrono::duration_cast(duration).count(); +case TimeUnit::MILLI: + return std::chrono::duration_cast(duration).count(); +case TimeUnit::MICRO: + return std::chrono::duration_cast(duration).count(); +case TimeUnit::NANO: + return std::chrono::duration_cast(duration).count(); +default: + // Compiler errors without default case even though all enum cases are handled + assert(0); + return 0; + } +} + +static inline bool Parse_MM_DD(const char* s, + arrow_vendored::date::year_month_day* out) { + uint16_t year; + uint8_t month, day; + if 
(ARROW_PREDICT_FALSE(s[4] != '-') || ARROW_PREDICT_FALSE(s[7] != '-')) { return false; } + if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 4, ))) { +return false; + } + if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 5, 2, ))) { +return false; + } + if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 8, 2, ))) { +return false; + } + *out = {arrow_vendored::date::year{year}, arrow_vendored::date::month{month}, + arrow_vendored::date::day{day}}; + return out->ok(); +} - protected: - template - bool ConvertTimePoint(TimePoint tp, value_type* out) { -auto duration = tp.time_since_epoch(); -switch (unit_) { - case TimeUnit::SECOND: -*out = std::chrono::duration_cast(duration).count(); -return true; - case TimeUnit::MILLI: -
[GitHub] [arrow] kszucs commented on a change in pull request #7021: ARROW-8628: [Dev] Wrap docker-compose commands with archery
kszucs commented on a change in pull request #7021: URL: https://github.com/apache/arrow/pull/7021#discussion_r419339195 ## File path: .github/workflows/java.yml ## @@ -38,6 +38,8 @@ on: env: DOCKER_BUILDKIT: 0 COMPOSE_DOCKER_CLI_BUILD: 1 + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_TOKEN }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_USER }} Review comment: Missed that. I'm not sure what would be the best way to reduce the maintenance cost of this yml jungle. One possibility is to use a crossbow-like yml generator, but it has numerous disadvantages. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] pitrou commented on pull request #7081: [CI] Cache docker volumes [WIP]
pitrou commented on pull request #7081: URL: https://github.com/apache/arrow/pull/7081#issuecomment-623367064 Did they increase the available cache size? Last I looked it was a fixed size for the entire repo. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] mr-smidge commented on pull request #7032: ARROW-6603: [C#] Adds ArrayBuilder API to support writing null values + BooleanArray null support
mr-smidge commented on pull request #7032: URL: https://github.com/apache/arrow/pull/7032#issuecomment-623353035 Hi @zgramana (and @eerhardt). I was independently working on nullable array builder support (but have not been able to contribute just yet as my organisation needs to sign a CCLA). You beat me to it :-). However, I notice that you have baked in "core" validity buffer support into `BooleanArray.Builder`, and then other array builders nest a `BooleanArray.Builder` as their validity buffer, even though a bool array builder contains both a value and validity buffer. I took a slightly different approach, whereby I created an independent "bit buffer builder" that packs bits into an `ArrowBuffer` of bytes. Array builders then use a bit buffer builder for their validity map, and the boolean array builder uses two (one for values, one for validity). Once the CCLA is sorted, would there be any appetite for a PR that makes this refactoring? (In my personal opinion, it is cleaner not to nest `BooleanArray.Builder` within other builders - and this was my motivation for the bit buffer builder.) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] rymurr commented on pull request #7084: ARROW-8664: [Java] Add flag to skip null check
rymurr commented on pull request #7084: URL: https://github.com/apache/arrow/pull/7084#issuecomment-623343842 build is dependent on #7093 and rebase This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7093: ARROW-8687: [Java] Remove references to io.netty.buffer.ArrowBuf
github-actions[bot] commented on pull request #7093: URL: https://github.com/apache/arrow/pull/7093#issuecomment-623337581 https://issues.apache.org/jira/browse/ARROW-8687 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] rymurr commented on pull request #7093: [Java] ARROW-8687: Remove references to io.netty.buffer.ArrowBuf
rymurr commented on pull request #7093: URL: https://github.com/apache/arrow/pull/7093#issuecomment-623318684 @liyafan82 just noticed a few entries of `io.netty.buffer.ArrowBuf` after the recent merge of your patch to move it to `org.apache.arrow.memory.ArrowBuf` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7093: [Java] ARROW-8687: Remove references to io.netty.buffer.ArrowBuf
github-actions[bot] commented on pull request #7093: URL: https://github.com/apache/arrow/pull/7093#issuecomment-623318389 Thanks for opening a pull request! Could you open an issue for this pull request on JIRA? https://issues.apache.org/jira/browse/ARROW Then could you also rename pull request title in the following format? ARROW-${JIRA_ID}: [${COMPONENT}] ${SUMMARY} See also: * [Other pull requests](https://github.com/apache/arrow/pulls/) * [Contribution Guidelines - How to contribute patches](https://arrow.apache.org/docs/developers/contributing.html#how-to-contribute-patches) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] rymurr opened a new pull request #7093: [Java] ARROW-8687: Remove references to io.netty.buffer.ArrowBuf
rymurr opened a new pull request #7093: URL: https://github.com/apache/arrow/pull/7093 Some references to `io.netty.buffer.ArrowBuf` were missed off in ARROW-8229. This cleans up the last remaining references. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] hantusk commented on issue #7082: pyarrow 0.17 atexit handler causes a segmentation fault
hantusk commented on issue #7082: URL: https://github.com/apache/arrow/issues/7082#issuecomment-623301347 Yes, macOS running python 3.7.5 or 3.7.7. I will try and reproduce and continue commenting in the JIRA issue. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nevi-me commented on a change in pull request #6770: ARROW-7842: [Rust] [Parquet] implement array_reader for list type columns
nevi-me commented on a change in pull request #6770: URL: https://github.com/apache/arrow/pull/6770#discussion_r419233839 ## File path: rust/parquet/src/arrow/array_reader.rs ## @@ -468,6 +491,391 @@ where } } +/// Implementation of list array reader. +pub struct ListArrayReader { +item_reader: Box, +data_type: ArrowType, +item_type: ArrowType, +list_def_level: i16, +list_rep_level: i16, +def_level_buffer: Option, +rep_level_buffer: Option, +} + +impl ListArrayReader { +/// Construct list array reader. +pub fn new( +item_reader: Box, +data_type: ArrowType, +item_type: ArrowType, +def_level: i16, +rep_level: i16, +) -> Self { +Self { +item_reader, +data_type, +item_type, +list_def_level: def_level, +list_rep_level: rep_level, +def_level_buffer: None, +rep_level_buffer: None, +} +} +} + +macro_rules! build_empty_list_array_with_primitive_items { +($item_type:ident) => {{ +let values_builder = PrimitiveBuilder::<$item_type>::new(0); +let mut builder = ListBuilder::new(values_builder); +let empty_list_array = builder.finish(); +Ok(Arc::new(empty_list_array)) +}}; +} + +macro_rules! 
build_empty_list_array_with_non_primitive_items { +($builder:ident) => {{ +let values_builder = $builder::new(0); +let mut builder = ListBuilder::new(values_builder); +let empty_list_array = builder.finish(); +Ok(Arc::new(empty_list_array)) +}}; +} + +fn build_empty_list_array(item_type: ArrowType) -> Result { +match item_type { +ArrowType::UInt8 => build_empty_list_array_with_primitive_items!(ArrowUInt8Type), +ArrowType::UInt16 => { +build_empty_list_array_with_primitive_items!(ArrowUInt16Type) +} +ArrowType::UInt32 => { +build_empty_list_array_with_primitive_items!(ArrowUInt32Type) +} +ArrowType::UInt64 => { +build_empty_list_array_with_primitive_items!(ArrowUInt64Type) +} +ArrowType::Int8 => build_empty_list_array_with_primitive_items!(ArrowInt8Type), +ArrowType::Int16 => build_empty_list_array_with_primitive_items!(ArrowInt16Type), +ArrowType::Int32 => build_empty_list_array_with_primitive_items!(ArrowInt32Type), +ArrowType::Int64 => build_empty_list_array_with_primitive_items!(ArrowInt64Type), +ArrowType::Float32 => { +build_empty_list_array_with_primitive_items!(ArrowFloat32Type) +} +ArrowType::Float64 => { +build_empty_list_array_with_primitive_items!(ArrowFloat64Type) +} +ArrowType::Boolean => { +build_empty_list_array_with_primitive_items!(ArrowBooleanType) +} +ArrowType::Date32(_) => { +build_empty_list_array_with_primitive_items!(ArrowDate32Type) +} +ArrowType::Date64(_) => { +build_empty_list_array_with_primitive_items!(ArrowDate64Type) +} +ArrowType::Time32(ArrowTimeUnit::Second) => { +build_empty_list_array_with_primitive_items!(ArrowTime32SecondType) +} +ArrowType::Time32(ArrowTimeUnit::Millisecond) => { + build_empty_list_array_with_primitive_items!(ArrowTime32MillisecondType) +} +ArrowType::Time64(ArrowTimeUnit::Microsecond) => { + build_empty_list_array_with_primitive_items!(ArrowTime64MicrosecondType) +} +ArrowType::Time64(ArrowTimeUnit::Nanosecond) => { + build_empty_list_array_with_primitive_items!(ArrowTime64NanosecondType) +} 
+ArrowType::Duration(ArrowTimeUnit::Second) => { Review comment: That's probably incorrect, but we can clarify/fix the behaviour of the readers at a later stage. ## File path: rust/parquet/src/arrow/array_reader.rs ## @@ -468,6 +491,391 @@ where } } +/// Implementation of list array reader. +pub struct ListArrayReader { +item_reader: Box, +data_type: ArrowType, +item_type: ArrowType, +list_def_level: i16, +list_rep_level: i16, +def_level_buffer: Option, +rep_level_buffer: Option, +} + +impl ListArrayReader { +/// Construct list array reader. +pub fn new( +item_reader: Box, +data_type: ArrowType, +item_type: ArrowType, +def_level: i16, +rep_level: i16, +) -> Self { +Self { +item_reader, +data_type, +item_type, +list_def_level: def_level, +list_rep_level: rep_level, +def_level_buffer: None, +rep_level_buffer: None, +} +} +} + +macro_rules! build_empty_list_array_with_primitive_items { +($item_type:ident) => {{ +let values_builder = PrimitiveBuilder::<$item_type>::new(0); +let mut