[GitHub] [arrow] emkornfield commented on a change in pull request #6985: ARROW-8413: [C++][Parquet][WIP] Refactor Generating validity bitmap for values column
emkornfield commented on a change in pull request #6985: URL: https://github.com/apache/arrow/pull/6985#discussion_r413533870 ## File path: cpp/src/parquet/column_reader.cc ## @@ -50,6 +51,141 @@ using arrow::internal::checked_cast; namespace parquet { +namespace { + +inline void CheckLevelRange(const int16_t* levels, int64_t num_levels, +const int16_t max_expected_level) { + int16_t min_level = std::numeric_limits::max(); + int16_t max_level = std::numeric_limits::min(); + for (int x = 0; x < num_levels; x++) { +min_level = std::min(levels[x], min_level); +max_level = std::max(levels[x], max_level); + } + if (ARROW_PREDICT_FALSE(num_levels > 0 && (min_level < 0 || max_level > max_level))) { +throw ParquetException("definition level exceeds maximum"); + } +} + +#if !defined(ARROW_HAVE_BMI2) + +inline void DefinitionLevelsToBitmapScalar( +const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, +const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, +uint8_t* valid_bits, int64_t valid_bits_offset) { + // We assume here that valid_bits is large enough to accommodate the + // additional definition levels and the ones that have already been written + ::arrow::internal::BitmapWriter valid_bits_writer(valid_bits, valid_bits_offset, +num_def_levels); + + // TODO(itaiin): As an interim solution we are splitting the code path here + // between repeated+flat column reads, and non-repeated+nested reads. 
+ // Those paths need to be merged in the future + for (int i = 0; i < num_def_levels; ++i) { +if (def_levels[i] == max_definition_level) { + valid_bits_writer.Set(); +} else if (max_repetition_level > 0) { + // repetition+flat case + if (def_levels[i] == (max_definition_level - 1)) { +valid_bits_writer.Clear(); +*null_count += 1; + } else { +continue; + } +} else { + // non-repeated+nested case + if (def_levels[i] < max_definition_level) { +valid_bits_writer.Clear(); +*null_count += 1; + } else { +throw ParquetException("definition level exceeds maximum"); + } +} + +valid_bits_writer.Next(); + } + valid_bits_writer.Finish(); + *values_read = valid_bits_writer.position(); +} +#endif + +template +void DefinitionLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_levels, + const int16_t required_definition_level, + int64_t* values_read, int64_t* null_count, + uint8_t* valid_bits, int64_t valid_bits_offset) { + constexpr int64_t kBitMaskSize = 64; + int64_t set_count = 0; + *values_read = 0; + while (num_def_levels > 0) { +int64_t batch_size = std::min(num_def_levels, kBitMaskSize); +CheckLevelRange(def_levels, batch_size, required_definition_level); +uint64_t defined_bitmap = internal::GreaterThanBitmap(def_levels, batch_size, + required_definition_level - 1); +if (has_repeated_parent) { + // This is currently a specialized code path assuming only (nested) lists + // present through the leaf (i.e. no structs). + // Upper level code only calls this method + // when the leaf-values are nullable (otherwise no spacing is needed), + // Because only nested lists exists it is sufficient to know that the field + // was either null or included it (i.e. >= previous definition level -> > previous + // definition - 1). If there where structs mixed in, we need to know the def_level + // of the repeated parent so we can check for def_level > "def level of repeated + // parent". 
+ uint64_t present_bitmap = internal::GreaterThanBitmap( + def_levels, batch_size, required_definition_level - 2); + *values_read += internal::AppendSelectedBitsToValidityBitmap( + /*new_bits=*/defined_bitmap, + /*selection_bitmap*/ present_bitmap, valid_bits, &valid_bits_offset, + &set_count); +} else { + internal::AppendToValidityBitmap( + /*new_bits=*/defined_bitmap, + /*new_bit_count=*/batch_size, valid_bits, &valid_bits_offset, &set_count); + *values_read += batch_size; +} +def_levels += batch_size; +num_def_levels -= batch_size; + } + *null_count += *values_read - set_count; +} + +inline void DefinitionLevelsToBitmapDispatch( +const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, +const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, +uint8_t* valid_bits, int64_t valid_bits_offset) { + if (max_repetition_level > 0) { +#if ARROW_LITTLE_ENDIAN +// we use AVX2 as a proxy for BMI2 instruction set, since there doesn't seem to be a clean +// way of de
[GitHub] [arrow] emkornfield commented on a change in pull request #6985: ARROW-8413: [C++][Parquet][WIP] Refactor Generating validity bitmap for values column
emkornfield commented on a change in pull request #6985: URL: https://github.com/apache/arrow/pull/6985#discussion_r413530683 ## File path: cpp/src/arrow/util/bit_util.h ## @@ -43,13 +43,18 @@ #if defined(_MSC_VER) #include +#include #pragma intrinsic(_BitScanReverse) #pragma intrinsic(_BitScanForward) #define ARROW_BYTE_SWAP64 _byteswap_uint64 #define ARROW_BYTE_SWAP32 _byteswap_ulong +#define ARROW_POPCOUNT64 __popcnt64 +#define ARROW_POPCOUNT32 __popcnt #else #define ARROW_BYTE_SWAP64 __builtin_bswap64 #define ARROW_BYTE_SWAP32 __builtin_bswap32 +#define ARROW_POPCOUNT64 __builtin_popcountll +#define ARROW_POPCOUNT32 __builtin_popcount Review comment: Yep, so we'll be able to use it by 2030 hopefully? :) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on issue #7014: ARROW-8563: GO Minor change to make newBuilder public
github-actions[bot] commented on issue #7014: URL: https://github.com/apache/arrow/pull/7014#issuecomment-618187424 https://issues.apache.org/jira/browse/ARROW-8563 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] mrkn commented on issue #6667: ARROW-8162: [Format][Python] Add serialization for CSF sparse tensors to Python
mrkn commented on issue #6667: URL: https://github.com/apache/arrow/pull/6667#issuecomment-618185887 @rok Thank you for working on this! I'll merge this. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] houqp commented on issue #7009: ARROW-8552: [Rust] support iterate parquet row columns
houqp commented on issue #7009: URL: https://github.com/apache/arrow/pull/7009#issuecomment-618158794 @nevi-me rebased and tests are passing now :) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] xuancong84 opened a new issue #7017: suggestion: why not serialize complex numbers in a Python list/dict/set
xuancong84 opened a new issue #7017: URL: https://github.com/apache/arrow/issues/7017 Dear developers, I realize that complex numbers in Numpy arrays and Pandas dataframe/series can be serialized, but complex numbers in Python structures (e.g., `[1, 2.5, 3+1.j, np.nan]`) cannot be serialized. Why not make it consistent? Or are there some other reasons for concern? Thanks! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on a change in pull request #6985: ARROW-8413: [C++][Parquet][WIP] Refactor Generating validity bitmap for values column
fsaintjacques commented on a change in pull request #6985: URL: https://github.com/apache/arrow/pull/6985#discussion_r413428461 ## File path: cpp/src/arrow/util/bit_util.h ## @@ -43,13 +43,18 @@ #if defined(_MSC_VER) #include +#include #pragma intrinsic(_BitScanReverse) #pragma intrinsic(_BitScanForward) #define ARROW_BYTE_SWAP64 _byteswap_uint64 #define ARROW_BYTE_SWAP32 _byteswap_ulong +#define ARROW_POPCOUNT64 __popcnt64 +#define ARROW_POPCOUNT32 __popcnt #else #define ARROW_BYTE_SWAP64 __builtin_bswap64 #define ARROW_BYTE_SWAP32 __builtin_bswap32 +#define ARROW_POPCOUNT64 __builtin_popcountll +#define ARROW_POPCOUNT32 __builtin_popcount Review comment: TIL, C++20 has std::popcount. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on issue #7011: ARROW-8554: [C++][Benchmark] Fix building error "cannot bind lvalue"
wesm commented on issue #7011: URL: https://github.com/apache/arrow/pull/7011#issuecomment-618111873 manylinux1 uses gcc 4.8 but does not build the benchmarks. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on issue #7011: ARROW-8554 [C++][Benchmark] Fix building error "cannot bind lvalue"
fsaintjacques commented on issue #7011: URL: https://github.com/apache/arrow/pull/7011#issuecomment-61869 I understood that @bkietz added an entry for this, maybe it doesn't have the benchmark enabled? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on issue #7000: ARROW-8065: [C++][Dataset] Refactor ScanOptions and Fragment relation
fsaintjacques commented on issue #7000: URL: https://github.com/apache/arrow/pull/7000#issuecomment-618110604 Addressed most comments and updated followup ticket with what's missing. PTAL and merge quickly so we can unblock the blocked tickets :) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on issue #5947: ARROW-7300: [C++][Gandiva] Implement functions to cast from strings to integers/floats
wesm commented on issue #5947: URL: https://github.com/apache/arrow/pull/5947#issuecomment-618107435 @praveenbingo @projjal would you be able to take a look now? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on issue #7011: ARROW-8554 [C++][Benchmark] Fix building error "cannot bind lvalue"
wesm commented on issue #7011: URL: https://github.com/apache/arrow/pull/7011#issuecomment-618102656 Another gcc 4.8 issue here. We may need a more comprehensive build that also builds the benchmark executables This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on issue #6972: ARROW-8287: [Rust] Add "pretty" util to help with printing tabular output of RecordBatches
andygrove commented on issue #6972: URL: https://github.com/apache/arrow/pull/6972#issuecomment-618100341 Thanks @markhildreth for the detailed write-up in the JIRA! I've started looking through this. I'm not sure I understand all the points you made yet, but if there is a way to just expose the print_batches method and keep the pretty table details internal to utils then we should do that. I also think it is fine for the moment to ignore the use case where the schema varies between record batches and file a separate issue for that. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on issue #4140: ARROW-5123: [Rust] Parquet derive for simple structs
andygrove commented on issue #4140: URL: https://github.com/apache/arrow/pull/4140#issuecomment-618098211 @bryantbiggs I will take a look at the release tag issue this weekend. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on a change in pull request #7004: ARROW-3827: [Rust] Implement UnionArray Updated
andygrove commented on a change in pull request #7004: URL: https://github.com/apache/arrow/pull/7004#discussion_r413408444 ## File path: rust/arrow/src/array/mod.rs ## @@ -85,6 +85,7 @@ mod array; mod builder; mod data; mod equal; +mod union; Review comment: I agree that it makes sense to have submodules under array. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on a change in pull request #7004: ARROW-3827: [Rust] Implement UnionArray Updated
andygrove commented on a change in pull request #7004: URL: https://github.com/apache/arrow/pull/7004#discussion_r413408241 ## File path: rust/arrow/src/array/equal.rs ## @@ -1046,6 +1062,30 @@ impl PartialEq for Value { } } +impl JsonEqual for UnionArray { +fn equals_json(&self, _json: &[&Value]) -> bool { +unimplemented!() +} +} + +impl PartialEq for UnionArray { +fn eq(&self, json: &Value) -> bool { +match json { +Value::Array(json_array) => self.equals_json_values(&json_array), +_ => false, +} +} +} + +impl PartialEq for Value { +fn eq(&self, arrow: &UnionArray) -> bool { +match self { Review comment: nit: these two `eq` methods are very similar, but in one, you match on `self` and then compare to the argument and in the other one, this is transposed. It would be nice to make these consistent. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on a change in pull request #7004: ARROW-3827: [Rust] Implement UnionArray Updated
andygrove commented on a change in pull request #7004: URL: https://github.com/apache/arrow/pull/7004#discussion_r413407658 ## File path: rust/arrow/src/array/equal.rs ## @@ -692,6 +692,22 @@ impl ArrayEqual for StructArray { } } +impl ArrayEqual for UnionArray { +fn equals(&self, _other: &dyn Array) -> bool { +unimplemented!() +} + +fn range_equals( +&self, +_other: &dyn Array, +_start_idx: usize, +_end_idx: usize, +_other_start_idx: usize, +) -> bool { +unimplemented!() Review comment: Is this intentionally unimplemented? If so, would be good to add a comment explaining that. ## File path: rust/arrow/src/array/equal.rs ## @@ -1046,6 +1062,30 @@ impl PartialEq for Value { } } +impl JsonEqual for UnionArray { +fn equals_json(&self, _json: &[&Value]) -> bool { +unimplemented!() Review comment: Is this intentionally unimplemented? If so, would be good to add a comment explaining that. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on a change in pull request #7004: ARROW-3827: [Rust] Implement UnionArray Updated
andygrove commented on a change in pull request #7004: URL: https://github.com/apache/arrow/pull/7004#discussion_r413407264 ## File path: rust/arrow/src/array/equal.rs ## @@ -692,6 +692,22 @@ impl ArrayEqual for StructArray { } } +impl ArrayEqual for UnionArray { +fn equals(&self, _other: &dyn Array) -> bool { +unimplemented!() Review comment: Is this intentionally unimplemented? If so, would be good to add a comment explaining that. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on a change in pull request #6770: ARROW-7842: [Rust] [Parquet] implement array_reader for list type columns
andygrove commented on a change in pull request #6770: URL: https://github.com/apache/arrow/pull/6770#discussion_r413404556 ## File path: rust/datafusion/src/utils.rs ## @@ -120,6 +143,7 @@ pub fn array_value_to_string(column: array::ArrayRef, row: usize) -> Result { make_string!(array::Time64NanosecondArray, column, row) } +DataType::List(_) => make_string_from_list!(column, row), Review comment: Does it matter what data type the list is? Can lists of any type be converted into strings? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on a change in pull request #6770: ARROW-7842: [Rust] [Parquet] implement array_reader for list type columns
andygrove commented on a change in pull request #6770: URL: https://github.com/apache/arrow/pull/6770#discussion_r413404215 ## File path: rust/datafusion/src/utils.rs ## @@ -74,6 +74,29 @@ macro_rules! make_string { }}; } +macro_rules! make_string_from_list { +($column: ident, $row: ident) => {{ +let list = $column +.as_any() +.downcast_ref::() +.unwrap() +.value($row); +let string_values = match (0..list.len()) +.map(|i| array_value_to_string(list.clone(), i)) +.collect::>>() +{ +Ok(values) => values, +_ => { +return Err(ExecutionError::ExecutionError(format!( +"Unsupported {:?} type for repl.", Review comment: What does `repl` mean here? Maybe this error could be a bit more descriptive? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on a change in pull request #6770: ARROW-7842: [Rust] [Parquet] implement array_reader for list type columns
andygrove commented on a change in pull request #6770: URL: https://github.com/apache/arrow/pull/6770#discussion_r413403760 ## File path: rust/datafusion/src/utils.rs ## @@ -74,6 +74,29 @@ macro_rules! make_string { }}; } +macro_rules! make_string_from_list { +($column: ident, $row: ident) => {{ +let list = $column +.as_any() +.downcast_ref::() +.unwrap() Review comment: `expect` would be better than `unwrap`. Using a `Result` would be better than `expect`. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on a change in pull request #6770: ARROW-7842: [Rust] [Parquet] implement array_reader for list type columns
andygrove commented on a change in pull request #6770: URL: https://github.com/apache/arrow/pull/6770#discussion_r413403453 ## File path: rust/datafusion/src/logicalplan.rs ## @@ -828,8 +828,8 @@ mod tests { .build()?; let expected = "Projection: #id\ -\n Selection: #state Eq Utf8(\"CO\")\ -\nTableScan: employee.csv projection=Some([0, 3])"; +\n Selection: #state Eq Utf8(\"CO\")\ +\nTableScan: employee.csv projection=Some([0, 3])"; Review comment: Are these formatting changes from running rustfmt? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] mcassels commented on issue #6770: ARROW-7842 [Parquet][Rust] implement array_reader for list type columns
mcassels commented on issue #6770: URL: https://github.com/apache/arrow/pull/6770#issuecomment-618082896 @andygrove @nevi-me do you have any thoughts on this? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kou commented on issue #6996: ARROW-8538: [Packaging] Remove boost from homebrew formula
kou commented on issue #6996: URL: https://github.com/apache/arrow/pull/6996#issuecomment-618066972 Could you make a JIRA? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nealrichardson commented on issue #6996: ARROW-8538: [Packaging] Remove boost from homebrew formula
nealrichardson commented on issue #6996: URL: https://github.com/apache/arrow/pull/6996#issuecomment-618053499 @kou are you adding `osx_image: xcode11.3` somewhere already or should I make a JIRA? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on issue #7016: ARROW-8561: [C++][Gandiva] Stop using deprecated google::protobuf::MessageLite::ByteSize()
github-actions[bot] commented on issue #7016: URL: https://github.com/apache/arrow/pull/7016#issuecomment-618051647 https://issues.apache.org/jira/browse/ARROW-8561 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on issue #7016: ARROW-8561: [C++][Gandiva] Stop using deprecated google::protobuf::MessageLite::ByteSize()
github-actions[bot] commented on issue #7016: URL: https://github.com/apache/arrow/pull/7016#issuecomment-618049644 Revision: e6ed98341efbd0f7bfed30a7aaf12935afb85fa5 Submitted crossbow builds: [ursa-labs/crossbow @ actions-164](https://github.com/ursa-labs/crossbow/branches/all?query=actions-164) |Task|Status| ||--| |gandiva-jar-osx|[![TravisCI](https://img.shields.io/travis/ursa-labs/crossbow/actions-164-travis-gandiva-jar-osx.svg)](https://travis-ci.org/ursa-labs/crossbow/branches)| |gandiva-jar-xenial|[![TravisCI](https://img.shields.io/travis/ursa-labs/crossbow/actions-164-travis-gandiva-jar-xenial.svg)](https://travis-ci.org/ursa-labs/crossbow/branches)| This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kou commented on issue #7016: ARROW-8561: [C++][Gandiva] Stop using deprecated google::protobuf::MessageLite::ByteSize()
kou commented on issue #7016: URL: https://github.com/apache/arrow/pull/7016#issuecomment-618049071 @github-actions crossbow submit -g gandiva This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kou opened a new pull request #7016: ARROW-8561: [C++][Gandiva] Stop using deprecated google::protobuf::MessageLite::ByteSize()
kou opened a new pull request #7016: URL: https://github.com/apache/arrow/pull/7016 ByteSize() is deprecated and ByteSizeLong() is added since Protobuf 3.4.0. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on issue #7001: Use lowercase ws2_32 everywhere
wesm commented on issue #7001: URL: https://github.com/apache/arrow/pull/7001#issuecomment-618021001 @davidanthoff would you mind opening a JIRA issue for this and updating the PR title? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on issue #7015: ARROW-8560: [Rust] Docs for MutableBuffer resize are incorrect
github-actions[bot] commented on issue #7015: URL: https://github.com/apache/arrow/pull/7015#issuecomment-618016015 https://issues.apache.org/jira/browse/ARROW-8560 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] paddyhoran opened a new pull request #7015: ARROW-8560: [Rust] Docs for MutableBuffer resize are incorrect
paddyhoran opened a new pull request #7015: URL: https://github.com/apache/arrow/pull/7015 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on issue #6981: PARQUET-1845: [C++] Add expected results of Int96 in big-endian
wesm commented on issue #6981: URL: https://github.com/apache/arrow/pull/6981#issuecomment-617947022 I haven't had a chance to look in detail. Perhaps someone on the Parquet mailing list might be able to help This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kiszk commented on issue #6981: PARQUET-1845: [C++] Add expected results of Int96 in big-endian
kiszk commented on issue #6981: URL: https://github.com/apache/arrow/pull/6981#issuecomment-617935417 @wesm do you have any comment on this change? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] sunchao commented on a change in pull request #6949: ARROW-7681: [Rust] Explicitly seeking a BufReader will discard the internal buffer (2)
sunchao commented on a change in pull request #6949: URL: https://github.com/apache/arrow/pull/6949#discussion_r413158295 ## File path: rust/parquet/src/util/io.rs ## @@ -31,47 +33,83 @@ pub trait Position { } /// Struct that represents a slice of a file data with independent start position and -/// length. Internally clones provided file handle, wraps with BufReader and resets -/// position before any read. +/// length. Internally clones provided file handle, wraps with a custom implementation +/// of BufReader that resets position before any read. /// /// This is workaround and alternative for `file.try_clone()` method. It clones `File` /// while preserving independent position, which is not available with `try_clone()`. /// -/// Designed after `arrow::io::RandomAccessFile`. +/// Designed after `arrow::io::RandomAccessFile` and `std::io::BufReader` pub struct FileSource { -reader: Mutex>, -start: u64, // start position in a file -end: u64, // end position in a file +reader: RefCell, +start: u64, // start position in a file +end: u64, // end position in a file +buf: Vec, // the internal buffer `buf` in BufReader +buf_pos: usize, // equivalent to the `pos` param in BufReader Review comment: This is not super useful as it requires ppl to jump into `BufReader` to check the documentation - maybe add additional comments on what these two fields are for? ## File path: rust/parquet/src/util/io.rs ## @@ -31,47 +33,83 @@ pub trait Position { } /// Struct that represents a slice of a file data with independent start position and -/// length. Internally clones provided file handle, wraps with BufReader and resets -/// position before any read. +/// length. Internally clones provided file handle, wraps with a custom implementation +/// of BufReader that resets position before any read. /// /// This is workaround and alternative for `file.try_clone()` method. It clones `File` /// while preserving independent position, which is not available with `try_clone()`. 
/// -/// Designed after `arrow::io::RandomAccessFile`. +/// Designed after `arrow::io::RandomAccessFile` and `std::io::BufReader` pub struct FileSource { -reader: Mutex>, -start: u64, // start position in a file -end: u64, // end position in a file +reader: RefCell, +start: u64, // start position in a file +end: u64, // end position in a file +buf: Vec, // the internal buffer `buf` in BufReader +buf_pos: usize, // equivalent to the `pos` param in BufReader +buf_cap: usize, // equivalent to the `cap` param in BufReader } impl FileSource { /// Creates new file reader with start and length from a file handle pub fn new(fd: &R, start: u64, length: usize) -> Self { +let reader = RefCell::new(fd.try_clone().unwrap()); Self { -reader: Mutex::new(BufReader::new(fd.try_clone().unwrap())), +reader, start, end: start + length as u64, +buf: vec![0 as u8; DEFAULT_BUF_SIZE], +buf_pos: 0, +buf_cap: 0, } } + +// inspired from BufReader +fn fill_buf(&mut self) -> Result<&[u8]> { +if self.buf_pos >= self.buf_cap { +// If we've reached the end of our internal buffer then we need to fetch +// some more data from the underlying reader. +// Branch using `>=` instead of the more correct `==` +// to tell the compiler that the pos..cap slice is always valid. 
+debug_assert!(self.buf_pos == self.buf_cap); +let mut reader = self.reader.borrow_mut(); +reader.seek(SeekFrom::Start(self.start))?; // always seek to start before reading +self.buf_cap = reader.read(&mut self.buf)?; +self.buf_pos = 0; +} +Ok(&self.buf[self.buf_pos..self.buf_cap]) +} } impl Read for FileSource { fn read(&mut self, buf: &mut [u8]) -> Result { -let mut reader = self -.reader -.lock() -.map_err(|err| Error::new(ErrorKind::Other, err.to_string()))?; - let bytes_to_read = cmp::min(buf.len(), (self.end - self.start) as usize); let buf = &mut buf[0..bytes_to_read]; -reader.seek(SeekFrom::Start(self.start as u64))?; -let res = reader.read(buf); -if let Ok(bytes_read) = res { -self.start += bytes_read as u64; +// If we don't have any buffered data and we're doing a massive read +// (larger than our internal buffer), bypass our internal buffer +// entirely. +if self.buf_pos == self.buf_cap && buf.len() >= self.buf.len() { +// discard buffer +self.buf_pos = 0; +self.buf_cap = 0; +// read directly into param buffer +let mut reader = self.reader.borrow_mut(); Review comment: nit: m
[GitHub] [arrow] github-actions[bot] commented on issue #6995: ARROW-8549: [R] Assorted post-0.17 release cleanups
github-actions[bot] commented on issue #6995: URL: https://github.com/apache/arrow/pull/6995#issuecomment-617926769 Revision: 61e54eb79986e3946e15c373c76e3eee4294dd44 Submitted crossbow builds: [ursa-labs/crossbow @ actions-163](https://github.com/ursa-labs/crossbow/branches/all?query=actions-163) |Task|Status| ||--| |test-conda-r-3.6|[![CircleCI](https://img.shields.io/circleci/build/github/ursa-labs/crossbow/actions-163-circle-test-conda-r-3.6.svg)](https://circleci.com/gh/ursa-labs/crossbow/tree/actions-163-circle-test-conda-r-3.6)| |test-ubuntu-18.04-r-3.6|[![CircleCI](https://img.shields.io/circleci/build/github/ursa-labs/crossbow/actions-163-circle-test-ubuntu-18.04-r-3.6.svg)](https://circleci.com/gh/ursa-labs/crossbow/tree/actions-163-circle-test-ubuntu-18.04-r-3.6)| This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nealrichardson commented on issue #6995: ARROW-8549: [R] Assorted post-0.17 release cleanups
nealrichardson commented on issue #6995: URL: https://github.com/apache/arrow/pull/6995#issuecomment-617926040 @github-actions crossbow submit test-conda-r-3.6 test-ubuntu-18.04-r-3.6 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] houqp commented on issue #7009: ARROW-8552: [Rust] support iterate parquet row columns
houqp commented on issue #7009: URL: https://github.com/apache/arrow/pull/7009#issuecomment-617914513 Thanks @nevi-me for the fix, looks like your PR has been approved. I will wait for the merge. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on a change in pull request #7000: ARROW-8065: [C++][Dataset] Refactor ScanOptions and Fragment relation
fsaintjacques commented on a change in pull request #7000: URL: https://github.com/apache/arrow/pull/7000#discussion_r413164872 ## File path: python/pyarrow/_dataset.pyx ## @@ -519,30 +500,69 @@ cdef class Fragment: """ return Expression.wrap(self.fragment.partition_expression()) -def to_table(self, use_threads=True, MemoryPool memory_pool=None): -"""Convert this Fragment into a Table. +def _scanner(self, **kwargs): +return Scanner.from_fragment(self, **kwargs) -Use this convenience utility with care. This will serially materialize -the Scan result in memory before creating the Table. +def scan(self, columns=None, filter=None, use_threads=True, + MemoryPool memory_pool=None, **kwargs): +"""Builds a scan operation against the dataset. + Review comment: Yes, but you can still pass schema (which is not documented. I'll add this. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on issue #7014: Go: Minor change to make newBuilder public to aid upstream
github-actions[bot] commented on issue #7014: URL: https://github.com/apache/arrow/pull/7014#issuecomment-617881461 Thanks for opening a pull request! Could you open an issue for this pull request on JIRA? https://issues.apache.org/jira/browse/ARROW Then could you also rename pull request title in the following format? ARROW-${JIRA_ID}: [${COMPONENT}] ${SUMMARY} See also: * [Other pull requests](https://github.com/apache/arrow/pulls/) * [Contribution Guidelines - How to contribute patches](https://arrow.apache.org/docs/developers/contributing.html#how-to-contribute-patches) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] mindhash opened a new pull request #7014: Go: Minor change to make newBuilder public to aid upstream
mindhash opened a new pull request #7014: URL: https://github.com/apache/arrow/pull/7014 Hello team, This minor change makes newBuilder() public to reduce verbosity in upstream. To give you example, I am working on a parquet read / write into Arrow Record batch where the parquet data types are mapped to Arrow data types. My Repo: https://github.com/mindhash/arrow-parquet-go In such cases, it will be nice to have a builder API (newBuilder) be generic to accept a data type and return a respective array. I am looking at a similar situation for JSON reader. I think this change will make the builder API much easier for upstream as well as internal packages. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on issue #7000: ARROW-8065: [C++][Dataset] Refactor ScanOptions and Fragment relation
kszucs commented on issue #7000: URL: https://github.com/apache/arrow/pull/7000#issuecomment-617868408 @ursabot build This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #6954: ARROW-8440: [C++] Refine SIMD header files
pitrou commented on a change in pull request #6954: URL: https://github.com/apache/arrow/pull/6954#discussion_r413092039 ## File path: cpp/src/arrow/util/hash_util.h ## @@ -27,39 +27,27 @@ #include "arrow/util/logging.h" #include "arrow/util/macros.h" -#include "arrow/util/neon_util.h" -#include "arrow/util/sse_util.h" +#include "arrow/util/simd.h" -static inline uint32_t HW_crc32_u8(uint32_t crc, uint8_t v) { - DCHECK(false) << "Hardware CRC support is not enabled"; - return 0; -} - -static inline uint32_t HW_crc32_u16(uint32_t crc, uint16_t v) { - DCHECK(false) << "Hardware CRC support is not enabled"; - return 0; -} - -static inline uint32_t HW_crc32_u32(uint32_t crc, uint32_t v) { - DCHECK(false) << "Hardware CRC support is not enabled"; - return 0; -} - -static inline uint32_t HW_crc32_u64(uint32_t crc, uint64_t v) { +#ifdef ARROW_HAVE_SSE4_2 Review comment: Also, regardless of whether we move them or not, you should put them in the `arrow::internal` namespace. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kiszk commented on issue #6954: ARROW-8440: [C++] Refine SIMD header files
kiszk commented on issue #6954: URL: https://github.com/apache/arrow/pull/6954#issuecomment-617854855 Is the function `Armv8CrcHashParallel` uses somewhere? Sorry if I overlook it. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kiszk edited a comment on issue #6954: ARROW-8440: [C++] Refine SIMD header files
kiszk edited a comment on issue #6954: URL: https://github.com/apache/arrow/pull/6954#issuecomment-617854855 Is the function `Armv8CrcHashParallel` used somewhere? Sorry if I overlook it. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on a change in pull request #7000: ARROW-8065: [C++][Dataset] Refactor ScanOptions and Fragment relation
fsaintjacques commented on a change in pull request #7000: URL: https://github.com/apache/arrow/pull/7000#discussion_r413074200 ## File path: cpp/src/arrow/dataset/dataset.cc ## @@ -30,34 +30,40 @@ namespace arrow { namespace dataset { -Fragment::Fragment(std::shared_ptr scan_options) -: scan_options_(std::move(scan_options)), partition_expression_(scalar(true)) {} +Fragment::Fragment(std::shared_ptr partition_expression) +: partition_expression_(partition_expression ? partition_expression : scalar(true)) {} -const std::shared_ptr& Fragment::schema() const { - return scan_options_->schema(); -} +Result> InMemoryFragment::ReadPhysicalSchema() { Review comment: The effort to open a JIRA is equal to implementing it. I added it. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on issue #6982: ARROW-8514: [Developer][Release] Verify Python 3.5 Windows wheel
wesm commented on issue #6982: URL: https://github.com/apache/arrow/pull/6982#issuecomment-617827466 +1 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on issue #7003: from pyarrow import parquet fails with AttributeError: type object 'pyarrow._parquet.Statistics' has no attribute '__reduce_cython__'
wesm commented on issue #7003: URL: https://github.com/apache/arrow/issues/7003#issuecomment-617827061 Can you please open a JIRA issue and provide instructions to reproduce? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on issue #7012: ARROW-8555: [FlightRPC][Java] implement DoExchange
github-actions[bot] commented on issue #7012: URL: https://github.com/apache/arrow/pull/7012#issuecomment-617824864 https://issues.apache.org/jira/browse/ARROW-8555 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on issue #7013: ARROW-8512: [C++] Remove unused expression/operator prototype code
github-actions[bot] commented on issue #7013: URL: https://github.com/apache/arrow/pull/7013#issuecomment-617824867 https://issues.apache.org/jira/browse/ARROW-8512 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on a change in pull request #7000: ARROW-8065: [C++][Dataset] Refactor ScanOptions and Fragment relation
fsaintjacques commented on a change in pull request #7000: URL: https://github.com/apache/arrow/pull/7000#discussion_r413045582 ## File path: cpp/src/arrow/dataset/file_base.cc ## @@ -222,9 +214,8 @@ FragmentIterator FileSystemDataset::GetFragmentsImpl( } Result> FileSystemDataset::Write( -const WritePlan& plan, std::shared_ptr scan_context) { - std::vector> options(plan.paths.size()); - +const WritePlan& plan, std::shared_ptr scan_options, Review comment: That's a task for [ARROW-8382](https://jira.apache.org/jira/projects/ARROW/issues/ARROW-8382). The writer shouldn't be aware of such issue, it takes a file source, a format, a RecordBatchReader (the dropping should happen internally here) and writes it. Since the Writer API has no bindings and not in use yet, I added this to ARROW-8382 so we can prioritise ARROW-8062 (parquet dataset) and ARROW-8318 (fragment stored in dataset). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on a change in pull request #7000: ARROW-8065: [C++][Dataset] Refactor ScanOptions and Fragment relation
fsaintjacques commented on a change in pull request #7000: URL: https://github.com/apache/arrow/pull/7000#discussion_r413046083 ## File path: cpp/src/arrow/dataset/dataset.h ## @@ -100,16 +97,20 @@ class ARROW_DS_EXPORT InMemoryFragment : public Fragment { RecordBatchVector record_batches_; }; -/// \brief A container of zero or more Fragments. A Dataset acts as a discovery mechanism -/// of Fragments and partitions, e.g. files deeply nested in a directory. +/// \brief A container of zero or more Fragments. +/// +/// A Dataset acts as a union of Fragments, e.g. files deeply nested in a +/// directory. A Dataset has a schema, also known as the "reader" schema. +/// class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this { public: /// \brief Begin to build a new Scan operation against this Dataset Result> NewScan(std::shared_ptr context); Result> NewScan(); - /// \brief GetFragments returns an iterator of Fragments given ScanOptions. - FragmentIterator GetFragments(std::shared_ptr options); + /// \brief GetFragments returns an iterator of Fragments given a predicate. + FragmentIterator GetFragments(std::shared_ptr predicate); + FragmentIterator GetFragments(); Review comment: Ditto. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on a change in pull request #7000: ARROW-8065: [C++][Dataset] Refactor ScanOptions and Fragment relation
fsaintjacques commented on a change in pull request #7000: URL: https://github.com/apache/arrow/pull/7000#discussion_r413045582 ## File path: cpp/src/arrow/dataset/file_base.cc ## @@ -222,9 +214,8 @@ FragmentIterator FileSystemDataset::GetFragmentsImpl( } Result> FileSystemDataset::Write( -const WritePlan& plan, std::shared_ptr scan_context) { - std::vector> options(plan.paths.size()); - +const WritePlan& plan, std::shared_ptr scan_options, Review comment: That's a task for [ARROW-8382](https://jira.apache.org/jira/projects/ARROW/issues/ARROW-8382). The writer shouldn't be aware of such issue, it takes a file source, a format, a RecordBatchReader (the dropping should happen internally here) and writes it. Since the Writer API has no bindings and not in use, I added this to ARROW-8382 so we can prioritise ARROW-8062 (parquet dataset) and ARROW-8318 (fragment stored in dataset). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm opened a new pull request #7013: ARROW-8512: [C++] Remove unused expression/operator prototype code
wesm opened a new pull request #7013: URL: https://github.com/apache/arrow/pull/7013 None of this code was ever used. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] lidavidm opened a new pull request #7012: ARROW-8555: [FlightRPC][Java] implement DoExchange
lidavidm opened a new pull request #7012: URL: https://github.com/apache/arrow/pull/7012 This is a complete implementation of DoExchange for Java. It is not tested against the C++ implementation yet, however, it still passes integration tests, so the internal refactoring should not have broken compatibility with existing clients/servers. In this PR, I've refactored DoGet/DoPut/DoExchange on the client and server to share their implementation as much as possible. DoGet/DoPut retain their behavior of "eagerly" reading/writing schemas, but DoExchange allows the client/server to delay writing the schema until ready. This is checked in the unit tests. I also ran into some test flakes and tried to address them, by making sure we clean up things in the right order, and adding missing `close()` calls in some existing tests. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kiszk commented on a change in pull request #6954: ARROW-8440: [C++] Refine SIMD header files
kiszk commented on a change in pull request #6954: URL: https://github.com/apache/arrow/pull/6954#discussion_r413030123 ## File path: cpp/src/arrow/util/hash_util.h ## @@ -27,39 +27,27 @@ #include "arrow/util/logging.h" #include "arrow/util/macros.h" -#include "arrow/util/neon_util.h" -#include "arrow/util/sse_util.h" +#include "arrow/util/simd.h" -static inline uint32_t HW_crc32_u8(uint32_t crc, uint8_t v) { - DCHECK(false) << "Hardware CRC support is not enabled"; - return 0; -} - -static inline uint32_t HW_crc32_u16(uint32_t crc, uint16_t v) { - DCHECK(false) << "Hardware CRC support is not enabled"; - return 0; -} - -static inline uint32_t HW_crc32_u32(uint32_t crc, uint32_t v) { - DCHECK(false) << "Hardware CRC support is not enabled"; - return 0; -} - -static inline uint32_t HW_crc32_u64(uint32_t crc, uint64_t v) { +#ifdef ARROW_HAVE_SSE4_2 Review comment: How about moving this part into `simd.h`? We can reduce the number of places that depends on the target hardware. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on issue #7011: ARROW-8554 [C++][Benchmark] Fix building error "cannot bind lvalue"
github-actions[bot] commented on issue #7011: URL: https://github.com/apache/arrow/pull/7011#issuecomment-617806628 https://issues.apache.org/jira/browse/ARROW-8554 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] lidavidm commented on issue #6656: ARROW-8297: [FlightRPC][C++] Implement Flight DoExchange for C++
lidavidm commented on issue #6656: URL: https://github.com/apache/arrow/pull/6656#issuecomment-617806080 This should be ready now. The only issue I have is I could not figure out how to get GCC 4.8 to use the unique_ptr overload of a method instead of the shared_ptr overload. https://github.com/apache/arrow/pull/6656/files#diff-34371f71b4b1eecc5f855dd3dbd3340cR207-R214 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] plusplusjiajia opened a new pull request #7011: ARROW-8554 [C++][Benchmark] Fix building error "cannot bind lvalue"
plusplusjiajia opened a new pull request #7011: URL: https://github.com/apache/arrow/pull/7011 https://issues.apache.org/jira/browse/ARROW-8554 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on a change in pull request #7000: ARROW-8065: [C++][Dataset] Refactor ScanOptions and Fragment relation
fsaintjacques commented on a change in pull request #7000: URL: https://github.com/apache/arrow/pull/7000#discussion_r413007995 ## File path: cpp/src/arrow/dataset/dataset.h ## @@ -84,13 +82,12 @@ class ARROW_DS_EXPORT Fragment { class ARROW_DS_EXPORT InMemoryFragment : public Fragment { public: InMemoryFragment(RecordBatchVector record_batches, - std::shared_ptr scan_options); + std::shared_ptr = NULLPTR); Review comment: Doesn't compile. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kiszk commented on a change in pull request #6985: ARROW-8413: [C++][Parquet][WIP] Refactor Generating validity bitmap for values column
kiszk commented on a change in pull request #6985: URL: https://github.com/apache/arrow/pull/6985#discussion_r412990504 ## File path: cpp/src/parquet/column_reader.cc ## @@ -50,6 +51,141 @@ using arrow::internal::checked_cast; namespace parquet { +namespace { + +inline void CheckLevelRange(const int16_t* levels, int64_t num_levels, +const int16_t max_expected_level) { + int16_t min_level = std::numeric_limits::max(); + int16_t max_level = std::numeric_limits::min(); + for (int x = 0; x < num_levels; x++) { +min_level = std::min(levels[x], min_level); +max_level = std::max(levels[x], max_level); + } + if (ARROW_PREDICT_FALSE(num_levels > 0 && (min_level < 0 || max_level > max_level))) { +throw ParquetException("definition level exceeds maximum"); + } +} + +#if !defined(ARROW_HAVE_BMI2) + +inline void DefinitionLevelsToBitmapScalar( +const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, +const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, +uint8_t* valid_bits, int64_t valid_bits_offset) { + // We assume here that valid_bits is large enough to accommodate the + // additional definition levels and the ones that have already been written + ::arrow::internal::BitmapWriter valid_bits_writer(valid_bits, valid_bits_offset, +num_def_levels); + + // TODO(itaiin): As an interim solution we are splitting the code path here + // between repeated+flat column reads, and non-repeated+nested reads. 
+ // Those paths need to be merged in the future + for (int i = 0; i < num_def_levels; ++i) { +if (def_levels[i] == max_definition_level) { + valid_bits_writer.Set(); +} else if (max_repetition_level > 0) { + // repetition+flat case + if (def_levels[i] == (max_definition_level - 1)) { +valid_bits_writer.Clear(); +*null_count += 1; + } else { +continue; + } +} else { + // non-repeated+nested case + if (def_levels[i] < max_definition_level) { +valid_bits_writer.Clear(); +*null_count += 1; + } else { +throw ParquetException("definition level exceeds maximum"); + } +} + +valid_bits_writer.Next(); + } + valid_bits_writer.Finish(); + *values_read = valid_bits_writer.position(); +} +#endif + +template +void DefinitionLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_levels, + const int16_t required_definition_level, + int64_t* values_read, int64_t* null_count, + uint8_t* valid_bits, int64_t valid_bits_offset) { + constexpr int64_t kBitMaskSize = 64; + int64_t set_count = 0; + *values_read = 0; + while (num_def_levels > 0) { +int64_t batch_size = std::min(num_def_levels, kBitMaskSize); +CheckLevelRange(def_levels, batch_size, required_definition_level); +uint64_t defined_bitmap = internal::GreaterThanBitmap(def_levels, batch_size, + required_definition_level - 1); +if (has_repeated_parent) { + // This is currently a specialized code path assuming only (nested) lists + // present through the leaf (i.e. no structs). + // Upper level code only calls this method + // when the leaf-values are nullable (otherwise no spacing is needed), + // Because only nested lists exists it is sufficient to know that the field + // was either null or included it (i.e. >= previous definition level -> > previous + // definition - 1). If there where structs mixed in, we need to know the def_level + // of the repeated parent so we can check for def_level > "def level of repeated + // parent". 
+ uint64_t present_bitmap = internal::GreaterThanBitmap( + def_levels, batch_size, required_definition_level - 2); + *values_read += internal::AppendSelectedBitsToValidityBitmap( + /*new_bits=*/defined_bitmap, + /*selection_bitmap*/ present_bitmap, valid_bits, &valid_bits_offset, + &set_count); +} else { + internal::AppendToValidityBitmap( + /*new_bits=*/defined_bitmap, + /*new_bit_count=*/batch_size, valid_bits, &valid_bits_offset, &set_count); + *values_read += batch_size; +} +def_levels += batch_size; +num_def_levels -= batch_size; + } + *null_count += *values_read - set_count; +} + +inline void DefinitionLevelsToBitmapDispatch( +const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, +const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, +uint8_t* valid_bits, int64_t valid_bits_offset) { + if (max_repetition_level > 0) { +#if ARROW_LITTLE_ENDIAN +// we use AVx2 as a proxy for BMI2 instruction set, since there doesn't seem to be a clean +// way o detect t
[GitHub] [arrow] lidavidm commented on issue #6744: PARQUET-1820: [C++] pre-buffer specified columns of row group
lidavidm commented on issue #6744: URL: https://github.com/apache/arrow/pull/6744#issuecomment-617784720 Thanks for the review! > It might be nice to have a convenience for prebuffering an entire row group. Something like > auto rg = file_reader->RowGroup(i); > rg->PreBufferAllColumns(); I think it's not worth it given this is a relatively low-level API, but may be handy if you do expect people to be using this directly often (instead of via ArrowReaderProperties). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kiszk commented on a change in pull request #6985: ARROW-8413: [C++][Parquet][WIP] Refactor Generating validity bitmap for values column
kiszk commented on a change in pull request #6985: URL: https://github.com/apache/arrow/pull/6985#discussion_r412987615 ## File path: cpp/src/parquet/CMakeLists.txt ## @@ -336,6 +336,7 @@ set_source_files_properties(public_api_test.cc add_parquet_test(reader_test SOURCES column_reader_test.cc +level_conversion_test.cc Review comment: nit: indentation This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] lidavidm commented on a change in pull request #6744: PARQUET-1820: [C++] pre-buffer specified columns of row group
lidavidm commented on a change in pull request #6744: URL: https://github.com/apache/arrow/pull/6744#discussion_r412987199 ## File path: cpp/src/parquet/properties.h ## @@ -56,10 +60,32 @@ class PARQUET_EXPORT ReaderProperties { bool is_buffered_stream_enabled() const { return buffered_stream_enabled_; } + bool is_coalesced_stream_enabled() const { return coalesced_stream_enabled_; } + void enable_buffered_stream() { buffered_stream_enabled_ = true; } void disable_buffered_stream() { buffered_stream_enabled_ = false; } + /// Enable read coalescing. + /// + /// When enabled, readers can optionally call + /// ParquetFileReader.PreBuffer to cache the necessary slices of the + /// file in-memory before deserialization. Arrow readers + /// automatically do this. This is intended to increase performance + /// when reading from high-latency filesystems (e.g. Amazon S3). + /// + /// When this is enabled, it is NOT SAFE to concurrently create + /// RecordBatchReaders from the same file. Review comment: I've updated the wording - it's not unsafe (in that it won't crash), but it'll fail if you try to read a row group/column chunk that wasn't cached. This is because there's one shared cache that is not specific to a particular reader. Looking through the code, we could make this work better by avoiding a PreBuffer method and instead threading the cache all the way from the Arrow readers to the Parquet implementation. I think that's possible, but would require passing the cache between several classes/methods - do you think that's worth it? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] lidavidm commented on a change in pull request #6744: PARQUET-1820: [C++] pre-buffer specified columns of row group
lidavidm commented on a change in pull request #6744: URL: https://github.com/apache/arrow/pull/6744#discussion_r412985177 ## File path: cpp/src/parquet/file_reader.h ## @@ -117,6 +117,15 @@ class PARQUET_EXPORT ParquetFileReader { // Returns the file metadata. Only one instance is ever created std::shared_ptr metadata() const; + /// Pre-buffer the specified column indices in all row groups. + /// + /// Only has an effect if ReaderProperties.is_coalesced_stream_enabled is set; + /// otherwise this is a no-op. The reader internally maintains a cache which is + /// overwritten each time this is called. Intended to increase performance on + /// high-latency filesystems (e.g. Amazon S3). + void PreBuffer(const std::vector& row_groups, + const std::vector& column_indices); Review comment: I changed this to accept the CacheOptions struct here, which gives the caller full control each time they invoke it. ## File path: cpp/src/parquet/file_reader.h ## @@ -117,6 +117,15 @@ class PARQUET_EXPORT ParquetFileReader { // Returns the file metadata. Only one instance is ever created std::shared_ptr metadata() const; + /// Pre-buffer the specified column indices in all row groups. + /// + /// Only has an effect if ReaderProperties.is_coalesced_stream_enabled is set; Review comment: Yes - that makes more sense, combined with moving the flag to ArrowReaderProperties. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] paddyhoran commented on a change in pull request #7004: ARROW-3827: [Rust] Implement UnionArray Updated
paddyhoran commented on a change in pull request #7004: URL: https://github.com/apache/arrow/pull/7004#discussion_r412974468 ## File path: rust/arrow/src/array/union.rs ## @@ -0,0 +1,1172 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains the `UnionArray` and `UnionBuilder` types. +//! +//! Each slot in a `UnionArray` can have a value chosen from a number of types. Each of the +//! possible types are named like the fields of a [`StructArray`](crate::array::StructArray). +//! A `UnionArray` can have two possible memory layouts, "dense" or "sparse". For more information +//! on please see the [specification](https://arrow.apache.org/docs/format/Columnar.html#union-layout). +//! +//! Builders are provided for `UnionArray`'s involving primitive types. `UnionArray`'s of nested +//! types are also supported but not via `UnionBuilder`, see the tests for examples. +//! +//! # Example: Dense Memory Layout +//! +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_dense(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! 
builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 0_i32); +//! assert_eq!(union.value_offset(2), 1_i32); +//! +//! # Ok(()) +//! # } +//! ``` +//! +//! # Example: Sparse Memory Layout +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_sparse(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 1_i32); +//! assert_eq!(union.value_offset(2), 2_i32); +//! +//! # Ok(()) +//! # } +//! ``` +use crate::array::{ +builder::{builder_to_mutable_buffer, mutable_buffer_to_builder, BufferBuilderTrait}, +make_array, Array, ArrayData, ArrayDataBuilder, ArrayDataRef, ArrayRef, +BooleanBufferBuilder, BufferBuilder, Int32BufferBuilder, Int8BufferBuilder, +}; +use crate::buffer::{Buffer, MutableBuffer}; +use crate::datatypes::*; +use crate::error::{ArrowError, Result}; + +use crate::util::bit_util; +use core::fmt; +use std::any::Any; +use std::collections::HashMap; +use std::mem::size_of; + +/// An Array that can represent slots of varying types +pub struct UnionArray { +data: ArrayDataRef, +boxed_fields: Vec, +} + +impl UnionArray { +/// Creates a new `UnionArray`. +/// +/// Accepts type ids, child arrays and optionally offsets (for dense unions) to create +/// a new `UnionArray`. 
This method makes no attempt to validate the data provided by the +/// caller and assumes that each of the components are correct and consistent with each other. +/// See `try_new` for an alternative that validates the data provided. +/// +/// # Data Consistency +/// +/// The `type_ids` `Buffer` should contain `i8` values. These values should be greater than +/// zero and must be less than the number of children provided in `child_arrays`. These values +/// are used to index into the `child_arrays`. +/// +/// The `value_offsets` `Buffer` should contain `i32` values. These values should be greater Review comment: Yea, I only have a single constructor for both types that takes an `Option` for the `value_offsets`. The comment should point this out, I'll update. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to
[GitHub] [arrow] paddyhoran commented on a change in pull request #7004: ARROW-3827: [Rust] Implement UnionArray Updated
paddyhoran commented on a change in pull request #7004: URL: https://github.com/apache/arrow/pull/7004#discussion_r412972235 ## File path: rust/arrow/src/array/union.rs ## @@ -0,0 +1,1172 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains the `UnionArray` and `UnionBuilder` types. +//! +//! Each slot in a `UnionArray` can have a value chosen from a number of types. Each of the +//! possible types are named like the fields of a [`StructArray`](crate::array::StructArray). +//! A `UnionArray` can have two possible memory layouts, "dense" or "sparse". For more information +//! on please see the [specification](https://arrow.apache.org/docs/format/Columnar.html#union-layout). +//! +//! Builders are provided for `UnionArray`'s involving primitive types. `UnionArray`'s of nested +//! types are also supported but not via `UnionBuilder`, see the tests for examples. +//! +//! # Example: Dense Memory Layout +//! +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_dense(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! 
builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 0_i32); +//! assert_eq!(union.value_offset(2), 1_i32); +//! +//! # Ok(()) +//! # } +//! ``` +//! +//! # Example: Sparse Memory Layout +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_sparse(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 1_i32); +//! assert_eq!(union.value_offset(2), 2_i32); +//! +//! # Ok(()) +//! # } +//! ``` +use crate::array::{ +builder::{builder_to_mutable_buffer, mutable_buffer_to_builder, BufferBuilderTrait}, +make_array, Array, ArrayData, ArrayDataBuilder, ArrayDataRef, ArrayRef, +BooleanBufferBuilder, BufferBuilder, Int32BufferBuilder, Int8BufferBuilder, +}; +use crate::buffer::{Buffer, MutableBuffer}; +use crate::datatypes::*; +use crate::error::{ArrowError, Result}; + +use crate::util::bit_util; +use core::fmt; +use std::any::Any; +use std::collections::HashMap; +use std::mem::size_of; + +/// An Array that can represent slots of varying types +pub struct UnionArray { +data: ArrayDataRef, +boxed_fields: Vec, +} + +impl UnionArray { +/// Creates a new `UnionArray`. +/// +/// Accepts type ids, child arrays and optionally offsets (for dense unions) to create +/// a new `UnionArray`. 
This method makes no attempt to validate the data provided by the +/// caller and assumes that each of the components are correct and consistent with each other. +/// See `try_new` for an alternative that validates the data provided. +/// +/// # Data Consistency +/// +/// The `type_ids` `Buffer` should contain `i8` values. These values should be greater than +/// zero and must be less than the number of children provided in `child_arrays`. These values Review comment: I was just going but [this](https://arrow.apache.org/docs/format/Columnar.html#union-layout). It's not immediately obvious why you would need that. I'll have to study the C++ implementation. I'll open a follow up PR. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure
[GitHub] [arrow] pitrou commented on issue #6997: ARROW-8540: [C++] Add memory allocation benchmarks
pitrou commented on issue #6997: URL: https://github.com/apache/arrow/pull/6997#issuecomment-617760416 > I tried with multiple threads and the numbers look very similar between SystemAlloc and Jemalloc when I would have expected Jemalloc to do better here. Well, it's a trivial micro-benchmark. But the glibc allocator isn't that bad anyway. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on a change in pull request #6997: ARROW-8540: [C++] Add memory allocation benchmarks
fsaintjacques commented on a change in pull request #6997: URL: https://github.com/apache/arrow/pull/6997#discussion_r412951352 ## File path: cpp/src/arrow/memory_pool_benchmark.cc ## @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/memory_pool.h" +#include "arrow/result.h" +#include "arrow/util/logging.h" + +#include "benchmark/benchmark.h" + +namespace arrow { + +struct SystemAlloc { + static Result GetAllocator() { return system_memory_pool(); } +}; + +#ifdef ARROW_JEMALLOC +struct Jemalloc { + static Result GetAllocator() { +MemoryPool* pool; +RETURN_NOT_OK(jemalloc_memory_pool(&pool)); +return pool; + } +}; +#endif + +#ifdef ARROW_MIMALLOC +struct Mimalloc { + static Result GetAllocator() { +MemoryPool* pool; +RETURN_NOT_OK(mimalloc_memory_pool(&pool)); +return pool; + } +}; +#endif + +static void TouchCacheLines(uint8_t* data, int64_t nbytes) { + uint8_t total = 0; + while (nbytes > 0) { +total += *data; +data += 64; +nbytes -= 64; + } + benchmark::DoNotOptimize(total); +} + +// Benchmark the raw cost of allocating memory. +// Note this is a best case situation: we always allocate and deallocate exactly +// the same size, without any other allocator traffic. 
However, it can be +// representative of workloads where we routinely create and destroy +// temporary buffers for intermediate computation results. +template +static void AllocateDeallocate(benchmark::State& state) { // NOLINT non-const reference + const int64_t nbytes = state.range(0); + MemoryPool* pool = *Alloc::GetAllocator(); + + for (auto _ : state) { +uint8_t* data; +ARROW_CHECK_OK(pool->Allocate(nbytes, &data)); +pool->Free(data, nbytes); + } +} + +// Benchmark the cost of allocating memory plus accessing it. +template +static void AllocateTouchDeallocate( +benchmark::State& state) { // NOLINT non-const reference + const int64_t nbytes = state.range(0); + MemoryPool* pool = *Alloc::GetAllocator(); + + for (auto _ : state) { +uint8_t* data; +ARROW_CHECK_OK(pool->Allocate(nbytes, &data)); +TouchCacheLines(data, nbytes); +pool->Free(data, nbytes); + } +} + +// Benchmark the cost of accessing always the same memory area. +static void TouchArea(benchmark::State& state) { // NOLINT non-const reference Review comment: I didn't understand at first, is this because you want to measure the difference between `AllocateDeallocate` and `AllocateTouchDeallocate` and this test gives you a rough number to validate said gap? If so, add a comment on top of it and move this as the first test to help reading. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] pitrou commented on issue #6981: PARQUET-1845: [C++] Add expected results of Int96 in big-endian
pitrou commented on issue #6981: URL: https://github.com/apache/arrow/pull/6981#issuecomment-617744347 As a side note, I think you may want to start with the Arrow unit tests before trying to make the Parquet unit tests successful. Parquet relies on many Arrow facilities. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #6992: ARROW-7950: [Python] Determine + test minimal pandas version + raise error when pandas is too old
pitrou commented on a change in pull request #6992: URL: https://github.com/apache/arrow/pull/6992#discussion_r412929769 ## File path: python/pyarrow/tests/test_pandas.py ## @@ -2685,8 +2685,8 @@ class A: 'a': pd.period_range('2000-01-01', periods=20), }) -expected_msg = 'Conversion failed for column a with type period' -with pytest.raises(TypeError, match=expected_msg): +expected_msg = 'Conversion failed for column a with type period|object' Review comment: Do you mean `(period|object)`? (parentheses included) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #6992: ARROW-7950: [Python] Determine + test minimal pandas version + raise error when pandas is too old
pitrou commented on a change in pull request #6992: URL: https://github.com/apache/arrow/pull/6992#discussion_r412929368 ## File path: python/pyarrow/pandas-shim.pxi ## @@ -55,6 +55,16 @@ cdef class _PandasAPIShim(object): from distutils.version import LooseVersion self._loose_version = LooseVersion(pd.__version__) +if self._loose_version < LooseVersion('0.23.0'): +self._have_pandas = False +if raise_: +raise ImportError( +"pyarrow requires pandas 0.23.0 or above, pandas {} is " +"installed".format(self._version) +) +else: +return Review comment: Could we perhaps emit a warning here? I don't think that users expect their Pandas installation to be silently ignored. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #7004: ARROW-3827: [Rust] Implement UnionArray Updated
pitrou commented on a change in pull request #7004: URL: https://github.com/apache/arrow/pull/7004#discussion_r412926608 ## File path: rust/arrow/src/array/union.rs ## @@ -0,0 +1,1172 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains the `UnionArray` and `UnionBuilder` types. +//! +//! Each slot in a `UnionArray` can have a value chosen from a number of types. Each of the +//! possible types are named like the fields of a [`StructArray`](crate::array::StructArray). +//! A `UnionArray` can have two possible memory layouts, "dense" or "sparse". For more information +//! on please see the [specification](https://arrow.apache.org/docs/format/Columnar.html#union-layout). +//! +//! Builders are provided for `UnionArray`'s involving primitive types. `UnionArray`'s of nested +//! types are also supported but not via `UnionBuilder`, see the tests for examples. +//! +//! # Example: Dense Memory Layout +//! +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_dense(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! 
builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 0_i32); +//! assert_eq!(union.value_offset(2), 1_i32); +//! +//! # Ok(()) +//! # } +//! ``` +//! +//! # Example: Sparse Memory Layout +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_sparse(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 1_i32); +//! assert_eq!(union.value_offset(2), 2_i32); +//! +//! # Ok(()) +//! # } +//! ``` +use crate::array::{ +builder::{builder_to_mutable_buffer, mutable_buffer_to_builder, BufferBuilderTrait}, +make_array, Array, ArrayData, ArrayDataBuilder, ArrayDataRef, ArrayRef, +BooleanBufferBuilder, BufferBuilder, Int32BufferBuilder, Int8BufferBuilder, +}; +use crate::buffer::{Buffer, MutableBuffer}; +use crate::datatypes::*; +use crate::error::{ArrowError, Result}; + +use crate::util::bit_util; +use core::fmt; +use std::any::Any; +use std::collections::HashMap; +use std::mem::size_of; + +/// An Array that can represent slots of varying types +pub struct UnionArray { +data: ArrayDataRef, +boxed_fields: Vec, +} + +impl UnionArray { +/// Creates a new `UnionArray`. +/// +/// Accepts type ids, child arrays and optionally offsets (for dense unions) to create +/// a new `UnionArray`. 
This method makes no attempt to validate the data provided by the +/// caller and assumes that each of the components are correct and consistent with each other. +/// See `try_new` for an alternative that validates the data provided. +/// +/// # Data Consistency +/// +/// The `type_ids` `Buffer` should contain `i8` values. These values should be greater than +/// zero and must be less than the number of children provided in `child_arrays`. These values +/// are used to index into the `child_arrays`. +/// +/// The `value_offsets` `Buffer` should contain `i32` values. These values should be greater Review comment: I don't know anything about Rust, but I don't see a constructor for sparse Union arrays. Sparse Union arrays don't have a offsets buffer. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comm
[GitHub] [arrow] pitrou commented on a change in pull request #7004: ARROW-3827: [Rust] Implement UnionArray Updated
pitrou commented on a change in pull request #7004: URL: https://github.com/apache/arrow/pull/7004#discussion_r412925276 ## File path: rust/arrow/src/array/union.rs ## @@ -0,0 +1,1172 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains the `UnionArray` and `UnionBuilder` types. +//! +//! Each slot in a `UnionArray` can have a value chosen from a number of types. Each of the +//! possible types are named like the fields of a [`StructArray`](crate::array::StructArray). +//! A `UnionArray` can have two possible memory layouts, "dense" or "sparse". For more information +//! on please see the [specification](https://arrow.apache.org/docs/format/Columnar.html#union-layout). +//! +//! Builders are provided for `UnionArray`'s involving primitive types. `UnionArray`'s of nested +//! types are also supported but not via `UnionBuilder`, see the tests for examples. +//! +//! # Example: Dense Memory Layout +//! +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_dense(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! 
builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 0_i32); +//! assert_eq!(union.value_offset(2), 1_i32); +//! +//! # Ok(()) +//! # } +//! ``` +//! +//! # Example: Sparse Memory Layout +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_sparse(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 1_i32); +//! assert_eq!(union.value_offset(2), 2_i32); +//! +//! # Ok(()) +//! # } +//! ``` +use crate::array::{ +builder::{builder_to_mutable_buffer, mutable_buffer_to_builder, BufferBuilderTrait}, +make_array, Array, ArrayData, ArrayDataBuilder, ArrayDataRef, ArrayRef, +BooleanBufferBuilder, BufferBuilder, Int32BufferBuilder, Int8BufferBuilder, +}; +use crate::buffer::{Buffer, MutableBuffer}; +use crate::datatypes::*; +use crate::error::{ArrowError, Result}; + +use crate::util::bit_util; +use core::fmt; +use std::any::Any; +use std::collections::HashMap; +use std::mem::size_of; + +/// An Array that can represent slots of varying types +pub struct UnionArray { +data: ArrayDataRef, +boxed_fields: Vec, +} + +impl UnionArray { +/// Creates a new `UnionArray`. +/// +/// Accepts type ids, child arrays and optionally offsets (for dense unions) to create +/// a new `UnionArray`. 
This method makes no attempt to validate the data provided by the +/// caller and assumes that each of the components are correct and consistent with each other. +/// See `try_new` for an alternative that validates the data provided. +/// +/// # Data Consistency +/// +/// The `type_ids` `Buffer` should contain `i8` values. These values should be greater than +/// zero and must be less than the number of children provided in `child_arrays`. These values Review comment: Note that the Union type in C++ and in the IPC format allows you to specify logical type ids. For example if creating `UnionType({float32, utf8}, type_ids=[5, 7])`, then a UnionArray's type ids buffer could be something like `[5, 7, 7]` for values such as `[1.5, "foo", "bar"]`. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queri
[GitHub] [arrow] pitrou commented on a change in pull request #7004: ARROW-3827: [Rust] Implement UnionArray Updated
pitrou commented on a change in pull request #7004: URL: https://github.com/apache/arrow/pull/7004#discussion_r412925276 ## File path: rust/arrow/src/array/union.rs ## @@ -0,0 +1,1172 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains the `UnionArray` and `UnionBuilder` types. +//! +//! Each slot in a `UnionArray` can have a value chosen from a number of types. Each of the +//! possible types are named like the fields of a [`StructArray`](crate::array::StructArray). +//! A `UnionArray` can have two possible memory layouts, "dense" or "sparse". For more information +//! on please see the [specification](https://arrow.apache.org/docs/format/Columnar.html#union-layout). +//! +//! Builders are provided for `UnionArray`'s involving primitive types. `UnionArray`'s of nested +//! types are also supported but not via `UnionBuilder`, see the tests for examples. +//! +//! # Example: Dense Memory Layout +//! +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_dense(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! 
builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 0_i32); +//! assert_eq!(union.value_offset(2), 1_i32); +//! +//! # Ok(()) +//! # } +//! ``` +//! +//! # Example: Sparse Memory Layout +//! ``` +//! use arrow::array::UnionBuilder; +//! use arrow::datatypes::{Float64Type, Int32Type}; +//! +//! # fn main() -> arrow::error::Result<()> { +//! let mut builder = UnionBuilder::new_sparse(3); +//! builder.append::("a", 1).unwrap(); +//! builder.append::("b", 3.0).unwrap(); +//! builder.append::("a", 4).unwrap(); +//! let union = builder.build().unwrap(); +//! +//! assert_eq!(union.type_id(0), 0_i8); +//! assert_eq!(union.type_id(1), 1_i8); +//! assert_eq!(union.type_id(2), 0_i8); +//! +//! assert_eq!(union.value_offset(0), 0_i32); +//! assert_eq!(union.value_offset(1), 1_i32); +//! assert_eq!(union.value_offset(2), 2_i32); +//! +//! # Ok(()) +//! # } +//! ``` +use crate::array::{ +builder::{builder_to_mutable_buffer, mutable_buffer_to_builder, BufferBuilderTrait}, +make_array, Array, ArrayData, ArrayDataBuilder, ArrayDataRef, ArrayRef, +BooleanBufferBuilder, BufferBuilder, Int32BufferBuilder, Int8BufferBuilder, +}; +use crate::buffer::{Buffer, MutableBuffer}; +use crate::datatypes::*; +use crate::error::{ArrowError, Result}; + +use crate::util::bit_util; +use core::fmt; +use std::any::Any; +use std::collections::HashMap; +use std::mem::size_of; + +/// An Array that can represent slots of varying types +pub struct UnionArray { +data: ArrayDataRef, +boxed_fields: Vec, +} + +impl UnionArray { +/// Creates a new `UnionArray`. +/// +/// Accepts type ids, child arrays and optionally offsets (for dense unions) to create +/// a new `UnionArray`. 
This method makes no attempt to validate the data provided by the +/// caller and assumes that each of the components are correct and consistent with each other. +/// See `try_new` for an alternative that validates the data provided. +/// +/// # Data Consistency +/// +/// The `type_ids` `Buffer` should contain `i8` values. These values should be greater than +/// zero and must be less than the number of children provided in `child_arrays`. These values Review comment: Note that the Union type in C++ and in the IPC format allows you to specify logical type ids. For example if creating `UnionType({float32, int32}, type_ids=[5, 7])`, then a UnionArray's type ids buffer could be something like `[5, 7, 7]`. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on issue #7010: [Rust] [CI]: fix rustfmt failures
github-actions[bot] commented on issue #7010: URL: https://github.com/apache/arrow/pull/7010#issuecomment-617680858 Thanks for opening a pull request! Could you open an issue for this pull request on JIRA? https://issues.apache.org/jira/browse/ARROW Then could you also rename pull request title in the following format? ARROW-${JIRA_ID}: [${COMPONENT}] ${SUMMARY} See also: * [Other pull requests](https://github.com/apache/arrow/pulls/) * [Contribution Guidelines - How to contribute patches](https://arrow.apache.org/docs/developers/contributing.html#how-to-contribute-patches) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] jianxind commented on a change in pull request #6954: ARROW-8440: [C++] Refine SIMD header files
jianxind commented on a change in pull request #6954: URL: https://github.com/apache/arrow/pull/6954#discussion_r412846457 ## File path: docs/source/developers/benchmarks.rst ## @@ -59,7 +59,7 @@ Sometimes, it is required to pass custom CMake flags, e.g. .. code-block:: shell export CC=clang-8 CXX=clang++8 - archery benchmark run --cmake-extras="-DARROW_USE_SIMD=ON" + archery benchmark run --cmake-extras="-DARROW_SIMD_LEVEL=NONE" Review comment: Get it, then ARROW_SIMD_LEVEL=NONE is fine:) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nevi-me commented on issue #7009: ARROW-8552: [Rust] support iterate parquet row columns
nevi-me commented on issue #7009: URL: https://github.com/apache/arrow/pull/7009#issuecomment-617678597 > looks like the windows CI is failing with error not related to my change: > > ``` > "error: \'rustfmt.exe\' is not installed for the toolchain \'nightly-2019-11-14-x86_64-pc-windows-msvc\'\nTo install > ``` This will be fixed by #7010. May you please kindly rebase once it's merged This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nevi-me opened a new pull request #7010: [Rust] [CI]: fix rustfmt failures
nevi-me opened a new pull request #7010: URL: https://github.com/apache/arrow/pull/7010 This adds the `rustfmt` component to the Rust installations in Windows and MacOS, and fixes `rustfmt` related CI failures. @kszucs @paddyhoran This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nevi-me commented on issue #7004: ARROW-3827: [Rust] Implement UnionArray Updated
nevi-me commented on issue #7004: URL: https://github.com/apache/arrow/pull/7004#issuecomment-617676072 > @kszucs it's failing due to `rustfmt` not being installed before testing the flight crate, any idea why this would be the case? Sorry, I don't know much about GitHub actions yet... This change fixes it (https://github.com/apache/arrow/pull/6306/commits/f0369b72e164edd86d8f9e6a69cf2cbf44bb6d18). I don't understand why the same toolchain has previously worked, but now we're getting spurious failures in different PRs. I'll open a separate PR to fix it This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] cyb70289 commented on a change in pull request #6954: ARROW-8440: [C++] Refine SIMD header files
cyb70289 commented on a change in pull request #6954: URL: https://github.com/apache/arrow/pull/6954#discussion_r412836834 ## File path: docs/source/developers/benchmarks.rst ## @@ -59,7 +59,7 @@ Sometimes, it is required to pass custom CMake flags, e.g. .. code-block:: shell export CC=clang-8 CXX=clang++8 - archery benchmark run --cmake-extras="-DARROW_USE_SIMD=ON" + archery benchmark run --cmake-extras="-DARROW_SIMD_LEVEL=NONE" Review comment: [The context](https://arrow.apache.org/docs/developers/benchmarks.html#running-the-benchmark-suite) is to show how to pass extra cmake flags. Default value looks not very useful. Maybe change to AVX2? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kou commented on issue #7008: ARROW-8551: [CI][Gandiva] Use LLVM 8 in gandiva linux build
kou commented on issue #7008: URL: https://github.com/apache/arrow/pull/7008#issuecomment-617618030 Can we move the Docker image for building Gandiva on Linux to our `ci/docker/` like https://github.com/apache/arrow/blob/master/python/manylinux201x/Dockerfile-x86_64_base_2014 ? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on issue #7008: ARROW-8551: [CI][Gandiva] Use LLVM 8 in gandiva linux build
github-actions[bot] commented on issue #7008: URL: https://github.com/apache/arrow/pull/7008#issuecomment-617616494 Revision: 1e235ddc11ff6ee4620b62e3b5f9a318d117512b Submitted crossbow builds: [ursa-labs/crossbow @ actions-162](https://github.com/ursa-labs/crossbow/branches/all?query=actions-162) |Task|Status| ||--| |gandiva-jar-xenial|[![TravisCI](https://img.shields.io/travis/ursa-labs/crossbow/actions-162-travis-gandiva-jar-xenial.svg)](https://travis-ci.org/ursa-labs/crossbow/branches)| This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kou commented on issue #7008: ARROW-8551: [CI][Gandiva] Use LLVM 8 in gandiva linux build
kou commented on issue #7008: URL: https://github.com/apache/arrow/pull/7008#issuecomment-617615843 @github-actions crossbow submit gandiva-jar-xenial This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] tustvold commented on a change in pull request #6980: ARROW-8516: [Rust] Improve PrimitiveBuilder::append_slice performance
tustvold commented on a change in pull request #6980: URL: https://github.com/apache/arrow/pull/6980#discussion_r412736972 ## File path: rust/arrow/src/array/builder.rs ## @@ -236,6 +251,14 @@ impl BufferBuilderTrait for BufferBuilder { self.write_bytes(v.to_byte_slice(), 1) } +default fn append_n(&mut self, n: usize, v: T::Native) -> Result<()> { +self.reserve(n)?; +for _ in 0..n { +self.write_bytes(v.to_byte_slice(), 1)?; +} Review comment: That would be incorrect though? The method should write the value `v` `n` times. Calling `write_bytes(v.to_byte_slice(), n)` would write `v` once and then increase the recorded element count by n, which would leave things in an odd state. For the general case there is little reason for this method to exist, aside from reserving memory once. It mainly exists because the bitmap specialization can more efficiently append the same value multiple times than a naive loop such as this. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] tustvold commented on a change in pull request #6980: ARROW-8516: [Rust] Improve PrimitiveBuilder::append_slice performance
tustvold commented on a change in pull request #6980: URL: https://github.com/apache/arrow/pull/6980#discussion_r412736972 ## File path: rust/arrow/src/array/builder.rs ## @@ -236,6 +251,14 @@ impl BufferBuilderTrait for BufferBuilder { self.write_bytes(v.to_byte_slice(), 1) } +default fn append_n(&mut self, n: usize, v: T::Native) -> Result<()> { +self.reserve(n)?; +for _ in 0..n { +self.write_bytes(v.to_byte_slice(), 1)?; +} Review comment: That would be incorrect though? The method should write the value `v` `n` times. Calling `write_bytes(v.to_byte_slice(), n)` would write `v` once and then increase the recorded element count by n, which would leave things in an odd state. For the general case there is little reason for this method to exist, aside from reserving memory once. It mainly exists because the bitmap specialization can more efficiently append the same value multiple times than a naive loop such as this. Edit: Checked again and you're completely correct, there is a type specialization for the buffer writer - will fix This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] tustvold commented on a change in pull request #6980: ARROW-8516: [Rust] Improve PrimitiveBuilder::append_slice performance
tustvold commented on a change in pull request #6980: URL: https://github.com/apache/arrow/pull/6980#discussion_r412736972 ## File path: rust/arrow/src/array/builder.rs ## @@ -236,6 +251,14 @@ impl BufferBuilderTrait for BufferBuilder { self.write_bytes(v.to_byte_slice(), 1) } +default fn append_n(&mut self, n: usize, v: T::Native) -> Result<()> { +self.reserve(n)?; +for _ in 0..n { +self.write_bytes(v.to_byte_slice(), 1)?; +} Review comment: That would be incorrect though? The method should write the value `v` `n` times. Calling `write_bytes(v.to_byte_slice(), n)` would write `v` once and then increase the recorded element count by n, which would leave things in an odd state. For the general case there is little reason for this method to exist, aside from reserving memory once. It mainly exists because the bitmap specialization can more efficiently append the same value multiple times than a naive loop such as this. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] tustvold commented on a change in pull request #6980: ARROW-8516: [Rust] Improve PrimitiveBuilder::append_slice performance
tustvold commented on a change in pull request #6980: URL: https://github.com/apache/arrow/pull/6980#discussion_r412736972 ## File path: rust/arrow/src/array/builder.rs ## @@ -236,6 +251,14 @@ impl BufferBuilderTrait for BufferBuilder { self.write_bytes(v.to_byte_slice(), 1) } +default fn append_n(&mut self, n: usize, v: T::Native) -> Result<()> { +self.reserve(n)?; +for _ in 0..n { +self.write_bytes(v.to_byte_slice(), 1)?; +} Review comment: That would be incorrect though? The method should write the value `v` `n` times. Calling `write_bytes(v.to_byte_slice(), n)` would write `v` once and then increase the recorded element count by n, which would leave things in an odd state. For the general case there is no reason for this method to exist, it purely exists because the bitmap specialization can more efficiently append the same value multiple times than a naive loop such as this. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] tustvold commented on a change in pull request #6980: ARROW-8516: [Rust] Improve PrimitiveBuilder::append_slice performance
tustvold commented on a change in pull request #6980: URL: https://github.com/apache/arrow/pull/6980#discussion_r412736972 ## File path: rust/arrow/src/array/builder.rs ## @@ -236,6 +251,14 @@ impl BufferBuilderTrait for BufferBuilder { self.write_bytes(v.to_byte_slice(), 1) } +default fn append_n(&mut self, n: usize, v: T::Native) -> Result<()> { +self.reserve(n)?; +for _ in 0..n { +self.write_bytes(v.to_byte_slice(), 1)?; +} Review comment: That would be incorrect though? The method should write the value `v` `n` times. Calling `write_bytes(v.to_byte_slice(), n)` would write `v` once and then update the recorded element count by n, which would leave things in an odd state. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] tustvold commented on a change in pull request #6980: ARROW-8516: [Rust] Improve PrimitiveBuilder::append_slice performance
tustvold commented on a change in pull request #6980: URL: https://github.com/apache/arrow/pull/6980#discussion_r412736972 ## File path: rust/arrow/src/array/builder.rs ## @@ -236,6 +251,14 @@ impl BufferBuilderTrait for BufferBuilder { self.write_bytes(v.to_byte_slice(), 1) } +default fn append_n(&mut self, n: usize, v: T::Native) -> Result<()> { +self.reserve(n)?; +for _ in 0..n { +self.write_bytes(v.to_byte_slice(), 1)?; +} Review comment: That would be incorrect though? The method should write the value `v` `n` times. Calling `write_bytes(v.to_byte_slice(), n)` would write `v` once and then update the array length by n, which would leave things in an odd state. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] houqp commented on issue #7009: ARROW-8552: [Rust] support iterate parquet row columns
houqp commented on issue #7009: URL: https://github.com/apache/arrow/pull/7009#issuecomment-617600824 looks like the windows CI is failing with error not related to my change: ``` "error: \'rustfmt.exe\' is not installed for the toolchain \'nightly-2019-11-14-x86_64-pc-windows-msvc\'\nTo install ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org