[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API

2020-04-24 Thread GitBox


emkornfield commented on a change in pull request #7030:
URL: https://github.com/apache/arrow/pull/7030#discussion_r414990082



##
File path: cpp/src/jni/dataset/jni_wrapper.cpp
##

[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API

2020-04-24 Thread GitBox


emkornfield commented on a change in pull request #7030:
URL: https://github.com/apache/arrow/pull/7030#discussion_r414989937



##
File path: cpp/src/jni/dataset/jni_wrapper.cpp
##

[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API

2020-04-24 Thread GitBox


emkornfield commented on a change in pull request #7030:
URL: https://github.com/apache/arrow/pull/7030#discussion_r414989258



##
File path: 
java/dataset/src/test/java/org/apache/arrow/dataset/jni/NativeDatasetTest.java
##
@@ -0,0 +1,209 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.dataset.jni;
+
+import java.io.File;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Spliterator;
+import java.util.Spliterators;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
+
+import org.apache.arrow.dataset.DatasetTypes;
+import org.apache.arrow.dataset.file.FileFormat;
+import org.apache.arrow.dataset.file.FileSystem;
+import org.apache.arrow.dataset.file.SingleFileDatasetFactory;
+import org.apache.arrow.dataset.filter.Filter;
+import org.apache.arrow.dataset.filter.FilterImpl;
+import org.apache.arrow.dataset.scanner.ScanOptions;
+import org.apache.arrow.dataset.scanner.ScanTask;
+import org.apache.arrow.dataset.scanner.Scanner;
+import org.apache.arrow.dataset.source.Dataset;
+import org.apache.arrow.dataset.source.DatasetFactory;
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.junit.Assert;
+import org.junit.Ignore;
+import org.junit.Test;
+
+public class NativeDatasetTest {
+
+  private String sampleParquetPath() {
+return NativeDatasetTest.class.getResource(File.separator + 
"userdata.parquet").getPath();

Review comment:
   please don't check the binary file into this repo.  It would be better 
to generate a file on the fly, so readers of the test can understand what data 
exists there.  If a standalone parquet file is really needed, 
[arrow-testing](https://github.com/apache/arrow-testing) is the place to put it.
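   One way to do that is to have the test write a small Parquet file on the fly, e.g. via parquet-avro. A minimal sketch (assuming a parquet-avro and hadoop-client test dependency; class and column names are illustrative):

```java
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.hadoop.ParquetWriter;

public class TestFileGenerator {
  /** Writes a tiny two-column Parquet file so readers of the test can see the data. */
  static void writeSampleParquet(String path) throws Exception {
    Schema schema = SchemaBuilder.record("User").fields()
        .requiredInt("id")
        .requiredString("name")
        .endRecord();
    try (ParquetWriter<GenericRecord> writer =
        AvroParquetWriter.<GenericRecord>builder(new Path(path)).withSchema(schema).build()) {
      for (int i = 0; i < 10; i++) {
        GenericRecord record = new GenericData.Record(schema);
        record.put("id", i);
        record.put("name", "row-" + i);
        writer.write(record); // each record becomes one Parquet row
      }
    }
  }
}
```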





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API

2020-04-24 Thread GitBox


emkornfield commented on a change in pull request #7030:
URL: https://github.com/apache/arrow/pull/7030#discussion_r414988831



##
File path: java/pom.xml
##
@@ -369,24 +369,24 @@
   <groupId>org.apache.maven.plugins</groupId>
   <artifactId>maven-compiler-plugin</artifactId>
   <version>3.6.2</version>
-  <configuration>
-    <compilerArgs>
-      <arg>-XDcompilePolicy=simple</arg>
-      <arg>-Xplugin:ErrorProne</arg>
-    </compilerArgs>
-    <annotationProcessorPaths>
-      <path>
-        <groupId>com.google.errorprone</groupId>
-        <artifactId>error_prone_core</artifactId>
-        <version>2.3.3</version>
-      </path>
-      <path>
-        <groupId>org.immutables</groupId>
-        <artifactId>value</artifactId>
-        <version>2.8.2</version>
-      </path>
-    </annotationProcessorPaths>
-  </configuration>
+

Review comment:
   revert this?





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API

2020-04-24 Thread GitBox


emkornfield commented on a change in pull request #7030:
URL: https://github.com/apache/arrow/pull/7030#discussion_r414988755



##
File path: java/dataset/src/test/java/org/apache/arrow/util/SchemaUtilsTest.java
##
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.util;
+
+import static java.util.Arrays.asList;
+
+import java.io.IOException;
+
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class SchemaUtilsTest {
+
+  private static Field field(String name, boolean nullable, ArrowType type, 
Field... children) {
+return new Field(name, new FieldType(nullable, type, null, null), 
asList(children));
+  }
+
+  @Test
+  public void testSerializationAndDeserialization() throws IOException {
+Schema schema = new Schema(asList(
+field("a", false, new ArrowType.Null()),
+field("b", true, new ArrowType.Utf8()),
+field("c", true, new ArrowType.Binary()))
+);
+
+byte[] serialized = SchemaUtils.get().serialize(schema);
+Schema deserialized = SchemaUtils.get().deserialize(serialized, new 
RootAllocator(Long.MAX_VALUE));
+Assert.assertEquals(schema, deserialized);

Review comment:
   generally, I think we static import assertEquals.
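   For reference, the quoted assertion with the suggested static import (a sketch):

```java
import static org.junit.Assert.assertEquals;

// ... inside the test:
byte[] serialized = SchemaUtils.get().serialize(schema);
Schema deserialized = SchemaUtils.get().deserialize(serialized, new RootAllocator(Long.MAX_VALUE));
assertEquals(schema, deserialized);  // no Assert. prefix needed
```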





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API

2020-04-24 Thread GitBox


emkornfield commented on a change in pull request #7030:
URL: https://github.com/apache/arrow/pull/7030#discussion_r414988474



##
File path: java/dataset/src/main/java/org/apache/arrow/util/SchemaUtils.java
##
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.util;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.nio.channels.Channels;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.ipc.ReadChannel;
+import org.apache.arrow.vector.ipc.WriteChannel;
+import org.apache.arrow.vector.ipc.message.MessageChannelReader;
+import org.apache.arrow.vector.ipc.message.MessageResult;
+import org.apache.arrow.vector.ipc.message.MessageSerializer;
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel;
+
+/**
+ * Schema utility class including serialization and deserialization.
+ */
+public class SchemaUtils {
+  private static final SchemaUtils INSTANCE = new SchemaUtils();
+
+  public static SchemaUtils get() {
+return INSTANCE;
+  }
+
+  private SchemaUtils() {
+
+  }
+
+  /**
+   * Deserialize Arrow schema from byte array.
+   */
+  public Schema deserialize(byte[] bytes, BufferAllocator allocator) throws 
IOException {

Review comment:
   why not make these static methods instead of using a singleton pattern?
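   A sketch of the static-method version (assuming MessageSerializer.deserializeSchema can read the schema directly from a ReadChannel, which also drops the allocator parameter):

```java
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.channels.Channels;

import org.apache.arrow.vector.ipc.ReadChannel;
import org.apache.arrow.vector.ipc.WriteChannel;
import org.apache.arrow.vector.ipc.message.MessageSerializer;
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel;

/** Same functionality as the singleton, exposed as stateless static methods. */
public final class SchemaUtils {
  private SchemaUtils() {} // no instances needed

  public static byte[] serialize(Schema schema) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    MessageSerializer.serialize(new WriteChannel(Channels.newChannel(out)), schema);
    return out.toByteArray();
  }

  public static Schema deserialize(byte[] bytes) throws IOException {
    return MessageSerializer.deserializeSchema(
        new ReadChannel(new ByteArrayReadableSeekableByteChannel(bytes)));
  }
}
```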





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API

2020-04-24 Thread GitBox


emkornfield commented on a change in pull request #7030:
URL: https://github.com/apache/arrow/pull/7030#discussion_r414988443



##
File path: java/dataset/src/main/java/org/apache/arrow/util/SchemaUtils.java
##
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.util;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.nio.channels.Channels;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.ipc.ReadChannel;
+import org.apache.arrow.vector.ipc.WriteChannel;
+import org.apache.arrow.vector.ipc.message.MessageChannelReader;
+import org.apache.arrow.vector.ipc.message.MessageResult;
+import org.apache.arrow.vector.ipc.message.MessageSerializer;
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel;
+
+/**
+ * Schema utility class including serialization and deserialization.
+ */
+public class SchemaUtils {

Review comment:
   this probably belongs in the vector package.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API

2020-04-24 Thread GitBox


emkornfield commented on a change in pull request #7030:
URL: https://github.com/apache/arrow/pull/7030#discussion_r414988086



##
File path: 
java/dataset/src/main/java/org/apache/arrow/memory/NativeUnderlingMemory.java
##
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.memory;
+
+import org.apache.arrow.dataset.jni.JniWrapper;
+
+/**
+ * AllocationManager implementation for Native allocated memory.
+ */
+public class NativeUnderlingMemory extends AllocationManager {

Review comment:
   this should probably be in a different common package if we want to wrap 
the C++ allocator.  It isn't clear to me that we should do this because we lose 
java native heap caps IIUC.
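   For context, a rough sketch of what such a JNI-backed AllocationManager looks like, and why the cap concern arises: the bytes live on the C++ heap, so neither -XX:MaxDirectMemorySize nor Arrow's Java-side limits observe them except through the ledger. (The release0/memoryAddress/getSize hooks and the releaseBuffer native call are assumptions based on the surrounding PR, not confirmed API.)

```java
package org.apache.arrow.memory;

import org.apache.arrow.dataset.jni.JniWrapper;

public class NativeUnderlingMemory extends AllocationManager {
  private final int size;              // reported to the Java accounting ledger
  private final long nativeInstanceId; // handle owned by the C++ side
  private final long address;          // raw address of the native buffer

  NativeUnderlingMemory(BaseAllocator accountingAllocator, int size,
      long nativeInstanceId, long address) {
    super(accountingAllocator);
    this.size = size;
    this.nativeInstanceId = nativeInstanceId;
    this.address = address;
  }

  @Override
  protected void release0() {
    // Frees the C++ buffer; the JVM never saw this memory as direct memory.
    JniWrapper.get().releaseBuffer(nativeInstanceId);
  }

  @Override
  public long getSize() {
    return size;
  }

  @Override
  protected long memoryAddress() {
    return address;
  }
}
```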





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API

2020-04-24 Thread GitBox


emkornfield commented on a change in pull request #7030:
URL: https://github.com/apache/arrow/pull/7030#discussion_r414987860



##
File path: 
java/dataset/src/main/java/org/apache/arrow/dataset/source/DatasetFactory.java
##
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.dataset.source;
+
+import org.apache.arrow.vector.types.pojo.Schema;
+
+/**
+ * DataSourceDiscovery provides a way to inspect a DataSource potential
+ * schema before materializing it. Thus, the user can peek the schema for
+ * data sources and decide on a unified schema.
+ */
+public interface DatasetFactory {
+
+  Schema inspect();
+

Review comment:
   javadoc.
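   For example (a sketch; doc wording is illustrative):

```java
public interface DatasetFactory {

  /**
   * Reads the schema of the underlying data source without materializing a
   * {@link Dataset}, so the caller can decide on a unified schema up front.
   *
   * @return the inspected {@link org.apache.arrow.vector.types.pojo.Schema}
   */
  Schema inspect();
}
```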





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API

2020-04-24 Thread GitBox


emkornfield commented on a change in pull request #7030:
URL: https://github.com/apache/arrow/pull/7030#discussion_r414987685



##
File path: 
java/dataset/src/main/java/org/apache/arrow/dataset/scanner/ScanTask.java
##
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.dataset.scanner;
+
+import java.util.Iterator;
+
+import org.apache.arrow.vector.VectorSchemaRoot;
+
+/**
+ * Read record batches from a range of a single data fragment. A
+ * ScanTask is meant to be a unit of work to be dispatched. The implementation
+ * must be thread and concurrent safe.
+ */
+public interface ScanTask {
+
+  /**
+   * Creates and returns a {@link Itr} instance.
+   */
+  Itr scan();
+
+  /**
+   * The iterator implementation for {@link VectorSchemaRoot}s.
+   */
+  interface Itr extends Iterator, AutoCloseable {

Review comment:
   can we call this "Iterator"?





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API

2020-04-24 Thread GitBox


emkornfield commented on a change in pull request #7030:
URL: https://github.com/apache/arrow/pull/7030#discussion_r414987799



##
File path: 
java/dataset/src/main/java/org/apache/arrow/dataset/scanner/ScanTask.java
##
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.dataset.scanner;
+
+import java.util.Iterator;
+
+import org.apache.arrow.vector.VectorSchemaRoot;
+
+/**
+ * Read record batches from a range of a single data fragment. A
+ * ScanTask is meant to be a unit of work to be dispatched. The implementation
+ * must be thread and concurrent safe.
+ */
+public interface ScanTask {
+
+  /**
+   * Creates and returns a {@link Itr} instance.
+   */
+  Itr scan();
+
+  /**
+   * The iterator implementation for {@link VectorSchemaRoot}s.
+   */
+  interface Itr extends Iterator, AutoCloseable {
+// FIXME VectorSchemaRoot is not actually something ITERABLE. Using a 
reader convention instead.

Review comment:
   Fix this?
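   A reader-style shape for this interface, mirroring how ArrowReader exposes batches (a sketch; names are illustrative):

```java
import org.apache.arrow.vector.VectorSchemaRoot;

/** Reader convention: one reusable root, advanced batch by batch. */
interface BatchReader extends AutoCloseable {
  /** Loads the next record batch into the root; returns false when exhausted. */
  boolean loadNextBatch();

  /** The root that successive loadNextBatch() calls repopulate. */
  VectorSchemaRoot getVectorSchemaRoot();
}
```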





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API

2020-04-24 Thread GitBox


emkornfield commented on a change in pull request #7030:
URL: https://github.com/apache/arrow/pull/7030#discussion_r414987154



##
File path: 
java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java
##
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.dataset.jni;
+
+/**
+ * JNI wrapper for Datasets API's native implementation.
+ */
+public class JniWrapper {
+
+  private static final JniWrapper INSTANCE = new JniWrapper();
+
+  public static JniWrapper get() {
+return INSTANCE;
+  }
+
+  private JniWrapper() {
+JniLoader.get().ensureLoaded();
+  }
+
+  public native void closeDatasetFactory(long datasetFactoryId);

Review comment:
   these methods need javadoc.
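   For example (a sketch of the requested documentation; wording is illustrative):

```java
/**
 * Releases the native DatasetFactory associated with the given handle.
 *
 * @param datasetFactoryId handle returned when the factory was created
 */
public native void closeDatasetFactory(long datasetFactoryId);
```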





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API

2020-04-24 Thread GitBox


emkornfield commented on a change in pull request #7030:
URL: https://github.com/apache/arrow/pull/7030#discussion_r414986841



##
File path: 
java/dataset/src/main/java/org/apache/arrow/dataset/file/JniWrapper.java
##
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.dataset.file;
+
+import org.apache.arrow.dataset.jni.JniLoader;
+
+/**
+ * JniWrapper for filesystem based {@link 
org.apache.arrow.dataset.source.Dataset} implementations.
+ */
+public class JniWrapper {
+

Review comment:
   private constructor?





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API

2020-04-24 Thread GitBox


emkornfield commented on a change in pull request #7030:
URL: https://github.com/apache/arrow/pull/7030#discussion_r414986335



##
File path: cpp/src/jni/dataset/proto/Types.proto
##
@@ -0,0 +1,149 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+syntax = "proto2";
+package types;
+
+option java_package = "org.apache.arrow.dataset";

Review comment:
   why is this file being introduced?  





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API

2020-04-24 Thread GitBox


emkornfield commented on a change in pull request #7030:
URL: https://github.com/apache/arrow/pull/7030#discussion_r414985230



##
File path: cpp/src/jni/dataset/jni_wrapper.cpp
##
@@ -0,0 +1,577 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "concurrent_map.h"
+#include "arrow/compute/kernel.h"
+#include "arrow/compute/kernels/cast.h"
+#include "arrow/compute/kernels/compare.h"
+#include "jni/dataset/Types.pb.h"
+
+#include "org_apache_arrow_dataset_jni_JniWrapper.h"
+#include "org_apache_arrow_dataset_file_JniWrapper.h"
+
+static jclass illegal_access_exception_class;
+static jclass illegal_argument_exception_class;
+static jclass runtime_exception_class;
+
+static jclass record_batch_handle_class;
+static jclass record_batch_handle_field_class;
+static jclass record_batch_handle_buffer_class;
+
+static jmethodID record_batch_handle_constructor;
+static jmethodID record_batch_handle_field_constructor;
+static jmethodID record_batch_handle_buffer_constructor;
+
+static jint JNI_VERSION = JNI_VERSION_1_6;
+
+using arrow::jni::ConcurrentMap;
+
+static ConcurrentMap<std::shared_ptr<arrow::dataset::DatasetFactory>> dataset_factory_holder_;

Review comment:
   this still strikes me as a bad pattern.
   
   
   https://stackoverflow.com/questions/26375215/c-shared-ptr-and-java-native-object-ownership/26377515 seems better.
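   The pattern from the linked answer hands Java the address of a heap-allocated shared_ptr as the jlong handle, so no global map or id counter is needed. A minimal sketch (assuming the usual create/borrow/close JNI trio; names are illustrative):

```cpp
#include <jni.h>

#include <memory>

#include "arrow/dataset/dataset.h"

// Wrap: copy the shared_ptr onto the heap and hand its address to Java.
jlong WrapDataset(const std::shared_ptr<arrow::dataset::Dataset>& dataset) {
  return reinterpret_cast<jlong>(new std::shared_ptr<arrow::dataset::Dataset>(dataset));
}

// Borrow: dereference the handle without taking ownership.
std::shared_ptr<arrow::dataset::Dataset> BorrowDataset(jlong handle) {
  return *reinterpret_cast<std::shared_ptr<arrow::dataset::Dataset>*>(handle);
}

// Release: called from Java's close(); deleting the heap shared_ptr drops
// this reference, destroying the Dataset if it was the last one.
void ReleaseDataset(jlong handle) {
  delete reinterpret_cast<std::shared_ptr<arrow::dataset::Dataset>*>(handle);
}
```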





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API

2020-04-24 Thread GitBox


emkornfield commented on a change in pull request #7030:
URL: https://github.com/apache/arrow/pull/7030#discussion_r414984900



##
File path: cpp/src/jni/dataset/jni_wrapper.cpp
##
[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API

2020-04-24 Thread GitBox


emkornfield commented on a change in pull request #7030:
URL: https://github.com/apache/arrow/pull/7030#discussion_r414984380



##
File path: cpp/src/jni/dataset/jni_wrapper.cpp
##
@@ -0,0 +1,577 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "concurrent_map.h"
+#include "arrow/compute/kernel.h"
+#include "arrow/compute/kernels/cast.h"
+#include "arrow/compute/kernels/compare.h"
+#include "jni/dataset/Types.pb.h"
+
+#include "org_apache_arrow_dataset_jni_JniWrapper.h"
+#include "org_apache_arrow_dataset_file_JniWrapper.h"
+
+static jclass illegal_access_exception_class;
+static jclass illegal_argument_exception_class;
+static jclass runtime_exception_class;
+
+static jclass record_batch_handle_class;
+static jclass record_batch_handle_field_class;
+static jclass record_batch_handle_buffer_class;
+
+static jmethodID record_batch_handle_constructor;
+static jmethodID record_batch_handle_field_constructor;
+static jmethodID record_batch_handle_buffer_constructor;
+
+static jint JNI_VERSION = JNI_VERSION_1_6;
+
+using arrow::jni::ConcurrentMap;
+
+static ConcurrentMap<std::shared_ptr<arrow::dataset::DatasetFactory>> dataset_factory_holder_;
+static ConcurrentMap<std::shared_ptr<arrow::dataset::Dataset>> dataset_holder_;
+static ConcurrentMap<std::shared_ptr<arrow::dataset::ScanTask>> scan_task_holder_;
+static ConcurrentMap<std::shared_ptr<arrow::dataset::Scanner>> scanner_holder_;
+static ConcurrentMap<std::shared_ptr<arrow::RecordBatchIterator>> iterator_holder_;
+static ConcurrentMap<std::shared_ptr<arrow::Buffer>> buffer_holder_;
+
+#define JNI_ASSIGN_OR_THROW_NAME(x, y) ARROW_CONCAT(x, y)

Review comment:
   I don't think these macros are necessary. You should be able to use a normal (templated) function, since you throw an exception instead of returning.
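   In that spirit, the macro pair can become ordinary functions (a sketch; it reuses the runtime_exception_class global from the quoted file, and the Java exception only surfaces once the JNI call returns, so callers must still return promptly):

```cpp
#include <jni.h>

#include <utility>

#include "arrow/result.h"
#include "arrow/status.h"

extern jclass runtime_exception_class;  // defined in jni_wrapper.cpp

// Throws a Java RuntimeException on error. The returned default-constructed
// value is a placeholder; the pending exception fires when control reaches Java.
template <typename T>
T JniGetOrThrow(JNIEnv* env, arrow::Result<T> result) {
  if (!result.ok()) {
    env->ThrowNew(runtime_exception_class, result.status().message().c_str());
    return T{};
  }
  return std::move(result).ValueOrDie();
}

void JniAssertOkOrThrow(JNIEnv* env, const arrow::Status& status) {
  if (!status.ok()) {
    env->ThrowNew(runtime_exception_class, status.message().c_str());
  }
}
```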





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API

2020-04-24 Thread GitBox


emkornfield commented on a change in pull request #7030:
URL: https://github.com/apache/arrow/pull/7030#discussion_r414983499



##
File path: cpp/src/jni/dataset/concurrent_map.h
##
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ */
+
+#ifndef JNI_ID_TO_MODULE_MAP_H
+#define JNI_ID_TO_MODULE_MAP_H
+
+#include 
+#include 
+#include 
+#include 
+
+#include "jni.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {

Review comment:
   Please do not copy and paste this file, factor it out to a common place 
that can be used by all JNI implementations.
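   For reference, the class being duplicated is small; a shared version under a common JNI directory could look like this (a sketch of the interface implied by the callers, not the exact file contents):

```cpp
#include <cstdint>
#include <mutex>
#include <unordered_map>
#include <utility>

namespace arrow {
namespace jni {

// Hands out jlong-compatible ids for native objects so every JNI module can
// translate Java handles to native values through one shared implementation.
template <typename Holder>
class ConcurrentMap {
 public:
  int64_t Insert(Holder holder) {
    std::lock_guard<std::mutex> lock(mtx_);
    int64_t id = next_id_++;
    map_[id] = std::move(holder);
    return id;
  }

  Holder Lookup(int64_t id) {
    std::lock_guard<std::mutex> lock(mtx_);
    return map_.at(id);
  }

  void Erase(int64_t id) {
    std::lock_guard<std::mutex> lock(mtx_);
    map_.erase(id);
  }

 private:
  std::mutex mtx_;
  int64_t next_id_ = 1;
  std::unordered_map<int64_t, Holder> map_;
};

}  // namespace jni
}  // namespace arrow
```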





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] emkornfield commented on a change in pull request #7025: ARROW-2260: [C++][Plasma] Use Gflags for command-line parsing

2020-04-24 Thread GitBox


emkornfield commented on a change in pull request #7025:
URL: https://github.com/apache/arrow/pull/7025#discussion_r414975521



##
File path: cpp/src/plasma/store.cc
##
@@ -1207,65 +1211,77 @@ void StartServer(char* socket_name, std::string 
plasma_directory, bool hugepages
   g_runner->Start(socket_name, plasma_directory, hugepages_enabled, 
external_store);
 }
 
+void UsageError(const char* error_msg, int exit_code=1) {
+  std::cerr << gflags::ProgramInvocationShortName() << ": " << error_msg << 
std::endl;
+  exit(exit_code);
+}
+
 }  // namespace plasma
 
+#ifdef __linux__
+#define SHM_DEFAULT_PATH "/dev/shm"
+#else
+#define SHM_DEFAULT_PATH "/tmp"
+#endif
+
+// Command-line flags.
+DEFINE_string(d, SHM_DEFAULT_PATH, "directory where to create the 
memory-backed file");
+DEFINE_string(e, "", "endpoint for external storage service, where objects "
+"evicted from Plasma store can be written to, optional");
+DEFINE_bool(h, false, "whether to enable hugepage support");
+DEFINE_string(s, "", "socket name where the Plasma store will listen for 
requests, required");
+DEFINE_string(m, "", "amount of memory in bytes to use for Plasma store, 
required");
+
 int main(int argc, char* argv[]) {
   ArrowLog::StartArrowLog(argv[0], ArrowLogLevel::ARROW_INFO);
   ArrowLog::InstallFailureSignalHandler();
+
+  gflags::SetUsageMessage("Shared-memory server for Arrow data.\nUsage: ");
+  gflags::SetVersionString("TODO");
+
   char* socket_name = nullptr;
   // Directory where plasma memory mapped files are stored.
   std::string plasma_directory;
   std::string external_store_endpoint;
   bool hugepages_enabled = false;
   int64_t system_memory = -1;
-  int c;
-  while ((c = getopt(argc, argv, "s:m:d:e:h")) != -1) {
-switch (c) {
-  case 'd':
-plasma_directory = std::string(optarg);
-break;
-  case 'e':
-external_store_endpoint = std::string(optarg);
-break;
-  case 'h':
-hugepages_enabled = true;
-break;
-  case 's':
-socket_name = optarg;
-break;
-  case 'm': {
-char extra;
-int scanned = sscanf(optarg, "%" SCNd64 "%c", &system_memory, &extra);
-ARROW_CHECK(scanned == 1);
-// Set system memory capacity
-plasma::PlasmaAllocator::SetFootprintLimit(static_cast<size_t>(system_memory));
-ARROW_LOG(INFO) << "Allowing the Plasma store to use up to "
-<< static_cast<double>(system_memory) / 1e9
-<< "GB of memory.";
-break;
-  }
-  default:
-exit(-1);
-}
+
+  gflags::ParseCommandLineFlags(&argc, &argv, true);

Review comment:
   please comment literal parameters /*parameter_name=*/
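   i.e., something like (a sketch):

```cpp
gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
```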

##
File path: cpp/src/plasma/store.cc
##
@@ -1304,8 +1320,9 @@ int main(int argc, char* argv[]) {
 plasma::ExternalStores::ExtractStoreName(external_store_endpoint, &name));
 external_store = plasma::ExternalStores::GetStore(name);
 if (external_store == nullptr) {
-  ARROW_LOG(FATAL) << "No such external store \"" << name << "\"";
-  return -1;
+  std::ostringstream error_msg;
+  error_msg << "no such external store \"" << name << "\"";
+  plasma::UsageError(error_msg.str().c_str(), -1);

Review comment:
   please comment literals with parameter name /*parameter_name=*/

##
File path: cpp/src/plasma/store.cc
##
@@ -1207,65 +1211,77 @@ void StartServer(char* socket_name, std::string 
plasma_directory, bool hugepages
   g_runner->Start(socket_name, plasma_directory, hugepages_enabled, 
external_store);
 }
 
+void UsageError(const char* error_msg, int exit_code=1) {
+  std::cerr << gflags::ProgramInvocationShortName() << ": " << error_msg << 
std::endl;

Review comment:
   might be worth a comment on why this doesn't use ARROW_LOG(FATAL)


[GitHub] [arrow] emkornfield commented on a change in pull request #7025: ARROW-2260: [C++][Plasma] Use Gflags for command-line parsing

2020-04-24 Thread GitBox


emkornfield commented on a change in pull request #7025:
URL: https://github.com/apache/arrow/pull/7025#discussion_r414974208



##
File path: cpp/src/plasma/store.cc
##
@@ -1207,65 +1211,77 @@ void StartServer(char* socket_name, std::string 
plasma_directory, bool hugepages
   g_runner->Start(socket_name, plasma_directory, hugepages_enabled, 
external_store);
 }
 
+void UsageError(const char* error_msg, int exit_code=1) {
+  std::cerr << gflags::ProgramInvocationShortName() << ": " << error_msg << 
std::endl;
+  exit(exit_code);
+}
+
 }  // namespace plasma
 
+#ifdef __linux__
+#define SHM_DEFAULT_PATH "/dev/shm"
+#else
+#define SHM_DEFAULT_PATH "/tmp"
+#endif
+
+// Command-line flags.
+DEFINE_string(d, SHM_DEFAULT_PATH, "directory where to create the 
memory-backed file");
+DEFINE_string(e, "", "endpoint for external storage service, where objects "
+"evicted from Plasma store can be written to, optional");
+DEFINE_bool(h, false, "whether to enable hugepage support");
+DEFINE_string(s, "", "socket name where the Plasma store will listen for 
requests, required");
+DEFINE_string(m, "", "amount of memory in bytes to use for Plasma store, 
required");
+
 int main(int argc, char* argv[]) {
   ArrowLog::StartArrowLog(argv[0], ArrowLogLevel::ARROW_INFO);
   ArrowLog::InstallFailureSignalHandler();
+
+  gflags::SetUsageMessage("Shared-memory server for Arrow data.\nUsage: ");
+  gflags::SetVersionString("TODO");

Review comment:
   it seems like this could be done by adding a define using the ARROW_VERSION cmake variable.
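   Concretely, CMake could stamp the Arrow version into a compile definition that the store then reports (a sketch; the definition name is illustrative):

```cpp
// CMakeLists.txt (sketch):
//   target_compile_definitions(plasma-store-server
//       PRIVATE PLASMA_VERSION_STRING="${ARROW_VERSION}")

#ifndef PLASMA_VERSION_STRING
#define PLASMA_VERSION_STRING "unknown"
#endif

// Replaces the "TODO" placeholder in main():
gflags::SetVersionString(PLASMA_VERSION_STRING);
```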





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] emkornfield commented on pull request #7025: ARROW-2260: [C++][Plasma] Use Gflags for command-line parsing

2020-04-24 Thread GitBox


emkornfield commented on pull request #7025:
URL: https://github.com/apache/arrow/pull/7025#issuecomment-619317374


   @chrish42 Thank you for the PR, I'll take a look now.  Note it looks like 
lint is failing due to formatting issues.  You need to run "make format" or 
"ninja format" to run clang-format (we are currently using version 8 I believe).



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] emkornfield commented on a change in pull request #6912: ARROW-8020: [Java] Implement vector validate functionality

2020-04-24 Thread GitBox


emkornfield commented on a change in pull request #6912:
URL: https://github.com/apache/arrow/pull/6912#discussion_r414972130



##
File path: java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java
##
@@ -283,4 +283,10 @@
* @return the name of the vector.
*/
   String getName();
+
+  /**
+   * Validate the vector, will throw exception if validate fail.
+   */
+  void validate();

Review comment:
   Instead of adding this directly to ValueVector, can we make a static utility method that does validation and accepts a ValueVector?
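   i.e., instead of a new interface method, something like (a sketch; the utility class name and the specific checks are illustrative):

```java
import org.apache.arrow.vector.ValueVector;

/** Validation as a standalone utility keeps the ValueVector interface unchanged. */
public final class ValueVectorUtility {
  private ValueVectorUtility() {}

  /** Validates the vector, throwing IllegalArgumentException on the first failure. */
  public static void validate(ValueVector vector) {
    if (vector.getValueCount() < 0) {
      throw new IllegalArgumentException("vector valueCount is negative");
    }
    // ... further buffer-size and offset checks would go here ...
  }
}
```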





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] wjones1 commented on a change in pull request #6979: ARROW-7800 [Python] implement iter_batches() method for ParquetFile and ParquetReader

2020-04-24 Thread GitBox


wjones1 commented on a change in pull request #6979:
URL: https://github.com/apache/arrow/pull/6979#discussion_r414951095



##
File path: python/pyarrow/_parquet.pyx
##
@@ -1083,6 +1084,50 @@ cdef class ParquetReader:
 def set_use_threads(self, bint use_threads):
 self.reader.get().set_use_threads(use_threads)
 
+def iter_batches(self, int64_t batch_size, row_groups, column_indices=None,

Review comment:
   I have those parameters because I was following the pattern set by the 
[`read_row_group()` and 
`read_row_groups()`](https://github.com/apache/arrow/blob/66a18e14f64359b73bd8ed5625c3092cd369af44/python/pyarrow/_parquet.pyx#L1131-L1136)
methods in the same file, which both have a `use_threads` parameter. Same 
with the `read_all()` method.
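   For illustration, the method under review is meant to be used like the existing readers (a sketch of the proposed API, not a final signature):

```python
import pyarrow.parquet as pq

pf = pq.ParquetFile("example.parquet")
# Stream fixed-size record batches instead of materializing one big table.
for batch in pf.iter_batches(batch_size=64 * 1024, use_threads=True):
    print(batch.num_rows)
```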





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] wjones1 commented on a change in pull request #6979: ARROW-7800 [Python] implement iter_batches() method for ParquetFile and ParquetReader

2020-04-24 Thread GitBox


wjones1 commented on a change in pull request #6979:
URL: https://github.com/apache/arrow/pull/6979#discussion_r414948745



##
File path: cpp/src/parquet/arrow/reader.cc
##
@@ -260,12 +260,28 @@ class FileReaderImpl : public FileReader {
 
   Status GetRecordBatchReader(const std::vector& row_group_indices,
   const std::vector& column_indices,
-  std::unique_ptr* out) 
override;
+  std::unique_ptr* out,

Review comment:
   Thanks for the feedback. I think I understand now the pattern going on 
with `set_use_threads()`. It's a method on the reader that [calls the method of 
the same name on its 
`ArrowReaderProperties`](https://github.com/apache/arrow/blob/master/cpp/src/parquet/arrow/reader.cc#L279-L281).
   
   I will implement the same thing for [the 
`set_batch_size()`](https://github.com/apache/arrow/blob/master/cpp/src/parquet/properties.h#L564)
 method.
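   i.e., mirroring that delegation (a sketch of the pattern, following the set_use_threads lines linked above):

```cpp
class FileReaderImpl : public FileReader {
 public:
  // Mirrors set_use_threads(): forward the setting to ArrowReaderProperties.
  void set_batch_size(int64_t batch_size) {
    reader_properties_.set_batch_size(batch_size);
  }

 private:
  parquet::ArrowReaderProperties reader_properties_;
};
```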





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] wjones1 commented on a change in pull request #6979: ARROW-7800 [Python] implement iter_batches() method for ParquetFile and ParquetReader

2020-04-24 Thread GitBox


wjones1 commented on a change in pull request #6979:
URL: https://github.com/apache/arrow/pull/6979#discussion_r414948789



##
File path: python/pyarrow/_parquet.pxd
##
@@ -334,7 +334,7 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" 
nogil:
 ArrowReaderProperties()
 void set_read_dictionary(int column_index, c_bool read_dict)
 c_bool read_dictionary()
-void set_batch_size()
+void set_batch_size(int batch_size)

Review comment:
    





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] zgramana commented on pull request #7032: ARROW-6603, ARROW-5708, ARROW-5634: [C#] Adds ArrayBuilder API to support writing null values + BooleanArray null support

2020-04-24 Thread GitBox


zgramana commented on pull request #7032:
URL: https://github.com/apache/arrow/pull/7032#issuecomment-619296269


   @eerhardt apologies for loading up three issues in the title, but I kept finding older issues in the Apache Jira backlog that were addressed here as well, and so erred on the side of too much rather than too little.



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] markhildreth edited a comment on pull request #7024: ARROW-8573: [Rust] Upgrade Rust to 1.44 nightly

2020-04-24 Thread GitBox


markhildreth edited a comment on pull request #7024:
URL: https://github.com/apache/arrow/pull/7024#issuecomment-619273713


   @andygrove I think there is going to be more to this than this PR. The 
"nightly-2019-11-14" string [can be found in a few 
places](https://github.com/apache/arrow/search?q=nightly-2019-11-14_q=nightly-2019-11-14),
 including some of the build scripts (notice that the workflows run for this PR 
are still called "2019-11-14").
   
   @nevi-me @paddyhoran The problem with this error seems to be that the change 
was only made in rust-toolchain. As a result, the GHA workflows will still 
build a Rust environment with a nightly-2019-11-14 toolchain. When cargo 
attempts to build, it will notice that the toolchain is out of date with the 
`rust-toolchain` file, and attempt to pull the new toolchain on-the-fly. 
However, it doesn't necessarily pull `rustfmt` when this occurs.
   
   ```
   installed toolchains
   
   
   stable-x86_64-pc-windows-msvc
   nightly-2019-11-14-x86_64-pc-windows-msvc
   
   active toolchain
   
   
   nightly-2019-11-14-x86_64-pc-windows-msvc (directory override for 
'\\?\D:\a\arrow\arrow')
   rustc 1.41.0-nightly (ded5ee001 2019-11-13)
   
   + mkdir -p /d/a/arrow/arrow/build/rust
   + pushd /d/a/arrow/arrow/rust
   /d/a/arrow/arrow/rust /d/a/arrow/arrow
   + RUSTFLAGS='-D warnings'
   + cargo build --all-targets
   info: syncing channel updates for 'nightly-2020-04-22-x86_64-pc-windows-msvc'
   info: latest update on 2020-04-22, rust version 1.44.0-nightly (45d050cde 
2020-04-21)
   info: downloading component 'cargo'
   info: downloading component 'rust-std'
   info: downloading component 'rustc'
   info: installing component 'cargo'
   info: installing component 'rust-std'
   info: installing component 'rustc'
   ```



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] mayuropensource edited a comment on pull request #7022: ARROW-8562: [C++] IO: Parameterize I/O Coalescing using S3 metrics

2020-04-24 Thread GitBox


mayuropensource edited a comment on pull request #7022:
URL: https://github.com/apache/arrow/pull/7022#issuecomment-619276182


   // SOME_S3_DATA_URI should point to a file (over http) that is ~500 MiB.
   // TTFB_sec is the time-to-first-byte in seconds as measured by curl
   // Bandwidth_bytes_per_sec is download bandwidth (in bytes per second) as 
measured by curl
   
   // Once SOME_S3_DATA_URI is created, it can be queried on a regular basis for measurements.
   
   `S3_DATA_URI="SOME_S3_DATA_URI"; `
   
   `curl --negotiate -u: -o /dev/null -w "connect_time_sec=%{time_connect} 
total_time_sec=%{time_total} data_size_bytes=%{size_download} 
TTFB_sec=%{time_starttransfer} Bandwidth_bytes_per_sec=%{speed_download}\n" 
$S3_DATA_URI`



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] mayuropensource commented on pull request #7022: ARROW-8562: [C++] IO: Parameterize I/O Coalescing using S3 metrics

2020-04-24 Thread GitBox


mayuropensource commented on pull request #7022:
URL: https://github.com/apache/arrow/pull/7022#issuecomment-619276182


   // SOME_S3_DATA_URI should point to a file (over http) that is ~500 MiB.
   // TTFB_sec is the time-to-first-byte in seconds as measured by curl
   // Bandwidth_bytes_per_sec is download bandwidth (in bytes per second) as 
measured by curl
   
   `S3_DATA_URI="SOME_S3_DATA_URI"; `
   
   `curl --negotiate -u: -o /dev/null -w "connect_time_sec=%{time_connect} 
total_time_sec=%{time_total} data_size_bytes=%{size_download} 
TTFB_sec=%{time_starttransfer} Bandwidth_bytes_per_sec=%{speed_download}\n" 
$S3_DATA_URI`



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] markhildreth commented on pull request #7024: ARROW-8573: [Rust] Upgrade Rust to 1.44 nightly

2020-04-24 Thread GitBox


markhildreth commented on pull request #7024:
URL: https://github.com/apache/arrow/pull/7024#issuecomment-619273713


   @andygrove I think there is going to be more to this than this PR. The 
"nightly-2019-11-14" string [can be found in a few 
places](https://github.com/apache/arrow/search?q=nightly-2019-11-14_q=nightly-2019-11-14),
 including some of the build scripts (notice that the workflows run for this PR 
are still called "2019-11-14").
   
   @nevi-me @paddyhoran The problem with this error seems to be that the change 
was only made in rust-toolchain. As a result, the GHA workflows will still 
build a Windows environment with a nightly-2019-11-14 toolchain. When cargo 
attempts to build, it will notice that the toolchain is out of date with the 
`rust-toolchain` file, and attempt to pull the new toolchain on-the-fly. 
However, it doesn't pull `rustfmt` when this occurs.
   
   ```
   installed toolchains
   
   
   stable-x86_64-pc-windows-msvc
   nightly-2019-11-14-x86_64-pc-windows-msvc
   
   active toolchain
   
   
   nightly-2019-11-14-x86_64-pc-windows-msvc (directory override for 
'\\?\D:\a\arrow\arrow')
   rustc 1.41.0-nightly (ded5ee001 2019-11-13)
   
   + mkdir -p /d/a/arrow/arrow/build/rust
   + pushd /d/a/arrow/arrow/rust
   /d/a/arrow/arrow/rust /d/a/arrow/arrow
   + RUSTFLAGS='-D warnings'
   + cargo build --all-targets
   info: syncing channel updates for 'nightly-2020-04-22-x86_64-pc-windows-msvc'
   info: latest update on 2020-04-22, rust version 1.44.0-nightly (45d050cde 
2020-04-21)
   info: downloading component 'cargo'
   info: downloading component 'rust-std'
   info: downloading component 'rustc'
   info: installing component 'cargo'
   info: installing component 'rust-std'
   info: installing component 'rustc'
   ```



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] velvia commented on a change in pull request #4815: [DISCUSS] Add strawman proposal for sparseness and data integrity

2020-04-24 Thread GitBox


velvia commented on a change in pull request #4815:
URL: https://github.com/apache/arrow/pull/4815#discussion_r414877852



##
File path: format/Message.fbs
##
@@ -21,10 +21,69 @@ include "Tensor.fbs";
 
 namespace org.apache.arrow.flatbuf;
 
+/// --------------------------------------------------------------------
+/// Buffer encoding schemes.
+/// --------------------------------------------------------------------
+
+/// Encoding for buffers representing integers as offsets from a reference
+/// value. This encoding uses fewer bits than the logical type indicates.
+/// It saves space when all values in the buffer can be represented with a
+/// small bit width (e.g. if all values in an int64 column are between -128
+/// and 127, then a bit width of 8 can be used) as an offset from the
+/// reference value.
+table FrameOfReferenceIntEncoding {
+  /// The value that all values in the buffer are relative to.
+  reference_value: long = 0;

Review comment:
   Depending on the size of your batch, a sloped representation would result in far smaller arrays, since the delta from a slope is typically much smaller and can fit into fewer bits. You can still do O(1) access to any element; just compute ax + b etc. That assumes the data is actually increasing, of course - otherwise stepwise is fine.
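   
   As a rough illustration of the trade-off (a minimal Rust sketch under assumed encodings; this is not the proposal's actual wire layout):
   
   ```rust
   // Frame of reference: value[i] = reference + delta[i].
   fn for_decode(reference: i64, deltas: &[i8], i: usize) -> i64 {
       reference + deltas[i] as i64
   }
   
   // Sloped variant: value[i] = a*i + b + residual[i]. For steadily
   // increasing data the residuals stay far smaller than the
   // frame-of-reference deltas, yet access is still O(1).
   fn sloped_decode(a: i64, b: i64, residuals: &[i8], i: usize) -> i64 {
       a * (i as i64) + b + residuals[i] as i64
   }
   
   fn main() {
       // Data 1000, 1010, 1020: FOR deltas keep growing (0, 10, 20),
       // while slope a=10, b=1000 leaves all residuals at 0.
       assert_eq!(for_decode(1000, &[0, 10, 20], 2), 1020);
       assert_eq!(sloped_decode(10, 1000, &[0, 0, 0], 2), 1020);
   }
   ```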





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] nevi-me commented on a change in pull request #7036: ARROW-8591: [Rust] Reverse lookup for a key in DictionaryArray

2020-04-24 Thread GitBox


nevi-me commented on a change in pull request #7036:
URL: https://github.com/apache/arrow/pull/7036#discussion_r414875192



##
File path: rust/arrow/src/array/array.rs
##
@@ -1786,38 +1786,34 @@ impl From<(Vec<(Field, ArrayRef)>, Buffer, usize)> for 
StructArray {
 /// This is mostly used to represent strings or a limited set of primitive 
types as integers,
 /// for example when doing NLP analysis or representing chromosomes by name.
 ///
-/// Example with nullable data:
+/// Example **with nullable** data:
 ///
 /// ```
-/// use arrow::array::DictionaryArray;
-/// use arrow::datatypes::Int8Type;
-/// let test = vec!["a", "a", "b", "c"];
-/// let array : DictionaryArray<Int8Type> = test.iter().map(|&x| if x == "b" {None} else {Some(x)}).collect();
-/// assert_eq!(array.keys().collect::<Vec<Option<i8>>>(), vec![Some(0), Some(0), None, Some(1)]);
+/// use arrow::array::DictionaryArray;
+/// use arrow::datatypes::Int8Type;
+/// let test = vec!["a", "a", "b", "c"];
+/// let array : DictionaryArray<Int8Type> = test.iter().map(|&x| if x == "b" {None} else {Some(x)}).collect();
+/// assert_eq!(array.keys().collect::<Vec<Option<i8>>>(), vec![Some(0), Some(0), None, Some(1)]);
 /// ```
 ///
-/// Example without nullable data:
+/// Example **without nullable** data:
 ///
 /// ```
-///
-/// use arrow::array::DictionaryArray;
-/// use arrow::datatypes::Int8Type;
-/// let test = vec!["a", "a", "b", "c"];
-/// let array : DictionaryArray<Int8Type> = test.into_iter().collect();
-/// assert_eq!(array.keys().collect::<Vec<Option<i8>>>(), vec![Some(0), Some(0), Some(1), Some(2)]);
+/// use arrow::array::DictionaryArray;
+/// use arrow::datatypes::Int8Type;
+/// let test = vec!["a", "a", "b", "c"];
+/// let array : DictionaryArray<Int8Type> = test.into_iter().collect();
+/// assert_eq!(array.keys().collect::<Vec<Option<i8>>>(), vec![Some(0), Some(0), Some(1), Some(2)]);
 /// ```
 pub struct DictionaryArray {
-// Array of keys, much like a PrimitiveArray
+/// Array of keys, much like a PrimitiveArray
 data: ArrayDataRef,
 
-// Pointer to the key values.
+/// Pointer to the key values.
 raw_values: RawPtrBox,
 
-// Array of any values.
+/// Array of any values.
 values: ArrayRef,
-

Review comment:
   `is_ordered` is needed for IPC, so for compatibility reasons we shouldn't remove it.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] github-actions[bot] commented on pull request #7037: ARROW-6718: [Rust] Remove packed_simd

2020-04-24 Thread GitBox


github-actions[bot] commented on pull request #7037:
URL: https://github.com/apache/arrow/pull/7037#issuecomment-619242490


   https://issues.apache.org/jira/browse/ARROW-6718



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] nevi-me opened a new pull request #7037: ARROW-6718: [Rust] Remove packed_simd

2020-04-24 Thread GitBox


nevi-me opened a new pull request #7037:
URL: https://github.com/apache/arrow/pull/7037


   This removes the dependency on packed_simd. I initially thought that boolean 
kernels were slower than with explicit SIMD, but this was a false alarm as the 
benchmarks weren't comparing SIMD vs non-SIMD.
   
   While doing this, I noticed that the `divide` kernel appears to be unsound: it checks whether a value is 0, but that can be true for a null slot too (the default data behind the bitmask is 0).
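   
   To illustrate the hazard (a simplified sketch with a plain `bool` slice standing in for the validity bitmap; the real kernel operates on Arrow arrays):
   
   ```rust
   // Element-wise divide that consults validity before the zero check.
   // A null slot's backing value is arbitrary (often 0 by default), so
   // testing `r == 0` without checking validity first would report a
   // spurious divide-by-zero for nulls.
   fn divide(
       lhs: &[i32],
       rhs: &[i32],
       rhs_valid: &[bool],
   ) -> Result<Vec<Option<i32>>, String> {
       lhs.iter()
           .zip(rhs.iter().zip(rhs_valid.iter()))
           .map(|(l, (r, valid))| {
               if !*valid {
                   Ok(None) // null divisor: its backing 0 is not a real 0
               } else if *r == 0 {
                   Err("divide by zero".to_string())
               } else {
                   Ok(Some(l / r))
               }
           })
           .collect()
   }
   ```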
   
   Below is the performance comparison:
   
   
   From 0.15.0 to 0.16.0
   
   ```
Running target/release/deps/arithmetic_kernels-ba6ab3db9f184b40
   add 512 time:   [15.565 us 15.623 us 15.694 us]
   change: [-66.359% -66.104% -65.861%] (p = 0.00 < 
0.05)
   Performance has improved.
   Found 4 outliers among 100 measurements (4.00%)
 3 (3.00%) high mild
 1 (1.00%) high severe
   
   add 512 simdtime:   [14.939 us 16.768 us 18.744 us]
   change: [+1.4006% +6.0795% +11.131%] (p = 0.02 < 
0.05)
   Performance has regressed.
   Found 9 outliers among 100 measurements (9.00%)
 1 (1.00%) high mild
 8 (8.00%) high severe
   
   subtract 512time:   [15.659 us 15.727 us 15.799 us]
   change: [-65.994% -65.847% -65.690%] (p = 0.00 < 
0.05)
   Performance has improved.
   
   subtract 512 simd   time:   [14.003 us 14.119 us 14.284 us]
   change: [-4.9276% -3.2446% -1.6479%] (p = 0.00 < 
0.05)
   Performance has improved.
   Found 5 outliers among 100 measurements (5.00%)
 2 (2.00%) high mild
 3 (3.00%) high severe
   
   multiply 512time:   [15.774 us 15.824 us 15.875 us]
   change: [-65.694% -65.526% -65.352%] (p = 0.00 < 
0.05)
   Performance has improved.
   Found 1 outliers among 100 measurements (1.00%)
 1 (1.00%) high mild
   
   multiply 512 simd   time:   [14.299 us 14.458 us 14.681 us]
   change: [-0.9771% -0.0444% +0.9882%] (p = 0.93 > 
0.05)
   No change in performance detected.
   Found 5 outliers among 100 measurements (5.00%)
 2 (2.00%) high mild
 3 (3.00%) high severe
   
   divide 512  time:   [16.690 us 16.731 us 16.774 us]
   change: [-65.394% -65.012% -64.701%] (p = 0.00 < 
0.05)
   Performance has improved.
   Found 3 outliers among 100 measurements (3.00%)
 3 (3.00%) high mild
   
   divide 512 simd time:   [16.098 us 16.147 us 16.202 us]
   change: [-3.6005% -2.6939% -1.9439%] (p = 0.00 < 
0.05)
   Performance has improved.
   Found 3 outliers among 100 measurements (3.00%)
 2 (2.00%) high mild
 1 (1.00%) high severe
   
   sum 512 no simd time:   [7.1888 us 7.2836 us 7.4349 us]
   change: [-1.2993% -0.2501% +1.2521%] (p = 0.73 > 
0.05)
   No change in performance detected.
   Found 6 outliers among 100 measurements (6.00%)
 3 (3.00%) high mild
 3 (3.00%) high severe
   
   limit 512, 256 no simd  time:   [6.8801 us 6.9257 us 6.9792 us]
   change: [-3.8909% -2.7450% -1.6742%] (p = 0.00 < 
0.05)
   Performance has improved.
   Found 9 outliers among 100 measurements (9.00%)
 2 (2.00%) high mild
 7 (7.00%) high severe
   
   limit 512, 512 no simd  time:   [6.8552 us 6.9007 us 6.9552 us]
   change: [-36.783% -31.294% -25.031%] (p = 0.00 < 
0.05)
   Performance has improved.
   Found 7 outliers among 100 measurements (7.00%)
 4 (4.00%) high mild
 3 (3.00%) high severe
   
Running target/release/deps/array_from_vec-9acb1269f64e7733
   array_from_vec 128  time:   [418.62 ns 423.66 ns 430.30 ns]
   change: [-2.2547% -0.6846% +0.9641%] (p = 0.48 > 
0.05)
   No change in performance detected.
   Found 8 outliers among 100 measurements (8.00%)
 3 (3.00%) high mild
 5 (5.00%) high severe
   
   array_from_vec 256  time:   [659.91 ns 661.68 ns 663.62 ns]
   change: [-2.1474% -1.6329% -1.1820%] (p = 0.00 < 
0.05)
   Performance has improved.
   Found 3 outliers among 100 measurements (3.00%)
 2 (2.00%) high mild
 1 (1.00%) high severe
   
   array_from_vec 512  time:   [1.1200 us 1.1244 us 1.1304 us]
   change: [-2.9911% -2.3466% -1.7654%] (p = 0.00 < 
0.05)
   Performance has improved.
   Found 2 outliers among 100 measurements (2.00%)
 1 (1.00%) high mild
 1 (1.00%) high severe
   
Running target/release/deps/boolean_kernels-25e7d12fe4fd7f63
   and 

[GitHub] [arrow] bkietz commented on a change in pull request #7033: ARROW-7759: [C++][Dataset] Add CsvFileFormat

2020-04-24 Thread GitBox


bkietz commented on a change in pull request #7033:
URL: https://github.com/apache/arrow/pull/7033#discussion_r414856596



##
File path: cpp/src/arrow/dataset/file_csv.h
##
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include 
+#include 
+
+#include "arrow/csv/options.h"
+#include "arrow/dataset/file_base.h"
+#include "arrow/dataset/type_fwd.h"
+#include "arrow/dataset/visibility.h"
+#include "arrow/result.h"
+
+namespace arrow {
+namespace dataset {
+
+/// \brief A FileFormat implementation that reads from and writes to Csv files
+class ARROW_DS_EXPORT CsvFileFormat : public FileFormat {
+ public:
+  csv::ParseOptions parse_options = csv::ParseOptions::Defaults();

Review comment:
   `fmt->parse_options.delimiter = '\t';`, for example.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] nealrichardson commented on a change in pull request #7033: ARROW-7759: [C++][Dataset] Add CsvFileFormat

2020-04-24 Thread GitBox


nealrichardson commented on a change in pull request #7033:
URL: https://github.com/apache/arrow/pull/7033#discussion_r414852277



##
File path: cpp/src/arrow/dataset/file_csv.h
##
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include 
+#include 
+
+#include "arrow/csv/options.h"
+#include "arrow/dataset/file_base.h"
+#include "arrow/dataset/type_fwd.h"
+#include "arrow/dataset/visibility.h"
+#include "arrow/result.h"
+
+namespace arrow {
+namespace dataset {
+
+/// \brief A FileFormat implementation that reads from and writes to Csv files
+class ARROW_DS_EXPORT CsvFileFormat : public FileFormat {
+ public:
+  csv::ParseOptions parse_options = csv::ParseOptions::Defaults();

Review comment:
   How do I provide these options? `auto fmt = std::make_shared<CsvFileFormat>();` and then like `fmt.parse_options = something`?





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] github-actions[bot] commented on pull request #7036: ARROW-8591: [Rust] Reverse lookup for a key in DictionaryArray

2020-04-24 Thread GitBox


github-actions[bot] commented on pull request #7036:
URL: https://github.com/apache/arrow/pull/7036#issuecomment-619219685


   https://issues.apache.org/jira/browse/ARROW-8591



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] github-actions[bot] commented on pull request #7035: ARROW-8590: [Rust] Use arrow crate pretty util in DataFusion

2020-04-24 Thread GitBox


github-actions[bot] commented on pull request #7035:
URL: https://github.com/apache/arrow/pull/7035#issuecomment-619219686


   https://issues.apache.org/jira/browse/ARROW-8590



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] vertexclique opened a new pull request #7036: ARROW-8591: [Rust] Reverse lookup for a key in DictionaryArray

2020-04-24 Thread GitBox


vertexclique opened a new pull request #7036:
URL: https://github.com/apache/arrow/pull/7036


   This PR enables reverse lookup on an already-built dictionary.
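   
   For context, "reverse lookup" here means mapping a value back to its key (its index in the dictionary's values array). A minimal sketch of the idea over plain slices; `lookup_key` is an illustrative name, not necessarily the API added by this PR:
   
   ```rust
   // Returns the key under which `value` was stored in the built
   // dictionary, or None if the value is not present.
   fn lookup_key(dict_values: &[&str], value: &str) -> Option<usize> {
       dict_values.iter().position(|v| *v == value)
   }
   
   fn main() {
       let dict_values = ["a", "b", "c"]; // distinct dictionary values
       assert_eq!(lookup_key(&dict_values, "b"), Some(1));
       assert_eq!(lookup_key(&dict_values, "z"), None);
   }
   ```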



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] markhildreth commented on pull request #6972: ARROW-8287: [Rust] Add "pretty" util to help with printing tabular output of RecordBatches

2020-04-24 Thread GitBox


markhildreth commented on pull request #6972:
URL: https://github.com/apache/arrow/pull/6972#issuecomment-619215502


   Created [follow-up JIRA 
task](https://issues.apache.org/jira/browse/ARROW-8590).



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] markhildreth opened a new pull request #7035: ARROW-8590: [Rust] Use arrow crate pretty util in DataFusion

2020-04-24 Thread GitBox


markhildreth opened a new pull request #7035:
URL: https://github.com/apache/arrow/pull/7035


   Fixes [ARROW-8590](https://issues.apache.org/jira/browse/ARROW-8590)
   
   This builds on #6972, and thus should be merged after that PR is merged.



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] nealrichardson commented on pull request #7033: ARROW-7759: [C++][Dataset] Add CsvFileFormat

2020-04-24 Thread GitBox


nealrichardson commented on pull request #7033:
URL: https://github.com/apache/arrow/pull/7033#issuecomment-619213038


   @github-actions rebase



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] nealrichardson commented on pull request #6879: ARROW-8377: [CI][C++][R] Build and run C++ tests on Rtools build

2020-04-24 Thread GitBox


nealrichardson commented on pull request #6879:
URL: https://github.com/apache/arrow/pull/6879#issuecomment-619212781


   @github-actions rebase



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] nealrichardson commented on issue #7034: R arrow package can't see arrow-cpp installation

2020-04-24 Thread GitBox


nealrichardson commented on issue #7034:
URL: https://github.com/apache/arrow/issues/7034#issuecomment-619210756


   We don't do any testing on NixOS, so it's not surprising that it doesn't 
just work.
   
   http://arrow.apache.org/docs/r/articles/install.html describes how 
dependencies are resolved and notes the relevant files inside the `r` directory 
of apache/arrow. It sounds like 
http://arrow.apache.org/docs/r/articles/install.html#using-system-libraries is 
most relevant since you say you already have Arrow C++ installed, so maybe you 
can set one of those environment variables to help it find them.
   
   If you have any fixes to enable it to work on your OS, please [open a JIRA 
ticket](https://issues.apache.org/jira/browse/ARROW) and I'll be happy to 
review your PR.



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] Zhen-hao opened a new issue #7034: R arrow package can't see arrow-cpp installation

2020-04-24 Thread GitBox


Zhen-hao opened a new issue #7034:
URL: https://github.com/apache/arrow/issues/7034


   Hi there,
   
   this is more a question than a bug report.
   
   I am using NixOS 20.03 and couldn't get the arrow library in R to see the 
arrow C++ library. 
   Even when I install the library from R itself, it can't find the C++ library 
after restarting. 
   The following is the installation log in R. This problem might be specific 
to NixOS. But if anyone can point to the source code that checks for the C++ 
library, I might be able to help to fix this for NixOS.
   Thanks in advance! 
   
   ```
   > arrow::install_arrow()
   Installing package into ‘/home/zhen/R/x86_64-pc-linux-gnu-library/3.6’
   (as ‘lib’ is unspecified)
   trying URL 'https://cran.rstudio.com/src/contrib/arrow_0.17.0.tar.gz'
   Content type 'application/x-gzip' length 242534 bytes (236 KB)
   ==
   downloaded 236 KB
   
   * installing *source* package ‘arrow’ ...
   ** package ‘arrow’ successfully unpacked and MD5 sums checked
   ** using staged installation
   *** No C++ binaries found for nixos-20
   *** Successfully retrieved C++ source
   *** Building C++ libraries
    cmake
    arrow  
   ./configure: line 132: cd: libarrow/arrow-0.17.0/lib: No such file or 
directory
   - NOTE ---
   After installation, please run arrow::install_arrow()
   for help installing required runtime libraries
   -
   ** libs
   /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ 
-std=gnu++11 
-I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG  
-I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include"
   -fpic  -g -O2  -c array.cpp -o array.o
   /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ 
-std=gnu++11 
-I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG  
-I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include"
   -fpic  -g -O2  -c array_from_vector.cpp -o array_from_vector.o
   /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ 
-std=gnu++11 
-I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG  
-I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include"
   -fpic  -g -O2  -c array_to_vector.cpp -o array_to_vector.o
   /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ 
-std=gnu++11 
-I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG  
-I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include"
   -fpic  -g -O2  -c arraydata.cpp -o arraydata.o
   /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ 
-std=gnu++11 
-I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG  
-I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include"
   -fpic  -g -O2  -c arrowExports.cpp -o arrowExports.o
   /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ 
-std=gnu++11 
-I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG  
-I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include"
   -fpic  -g -O2  -c buffer.cpp -o buffer.o
   /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ 
-std=gnu++11 
-I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG  
-I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include"
   -fpic  -g -O2  -c chunkedarray.cpp -o chunkedarray.o
   /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ 
-std=gnu++11 
-I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG  
-I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include"
   -fpic  -g -O2  -c compression.cpp -o compression.o
   /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ 
-std=gnu++11 
-I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG  
-I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include"
   -fpic  -g -O2  -c compute.cpp -o compute.o
   /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ 
-std=gnu++11 
-I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG  
-I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include"
   -fpic  -g -O2  -c csv.cpp -o csv.o
   /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ 
-std=gnu++11 
-I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG  
-I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include"
   -fpic  -g -O2  -c dataset.cpp -o dataset.o
   /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ 
-std=gnu++11 

[GitHub] [arrow] andygrove commented on pull request #7018: ARROW-8536: [Rust] [Flight] Check in proto file, conditional build if file exists

2020-04-24 Thread GitBox


andygrove commented on pull request #7018:
URL: https://github.com/apache/arrow/pull/7018#issuecomment-619202633


   Let's see what others say on this. Personally, I think it would be better 
for build.rs to automatically prepend the ASF license header because there is 
the risk of someone making manual changes to the file as part of a PR. 
   
   I know it is extra work, but it seems safer.
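   
   A minimal sketch of that idea (assumed file path and abbreviated header text; not the actual build script):
   
   ```rust
   // build.rs fragment: prepend the ASF license header to the checked-in
   // generated file if a fresh codegen run left it without one.
   use std::fs;
   
   // Abbreviated; the real header is the full ASF license text.
   const ASF_HEADER: &str =
       "// Licensed to the Apache Software Foundation (ASF) ...\n\n";
   
   fn main() {
       let path = "src/arrow.flight.protocol.rs"; // hypothetical generated file
       if let Ok(contents) = fs::read_to_string(path) {
           if !contents.starts_with("// Licensed") {
               let with_header = format!("{}{}", ASF_HEADER, contents);
               fs::write(path, with_header).expect("failed to prepend license header");
           }
       }
   }
   ```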



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] github-actions[bot] commented on pull request #7033: ARROW-7759: [C++][Dataset] Add CsvFileFormat

2020-04-24 Thread GitBox


github-actions[bot] commented on pull request #7033:
URL: https://github.com/apache/arrow/pull/7033#issuecomment-619195315


   https://issues.apache.org/jira/browse/ARROW-7759



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] bkietz opened a new pull request #7033: ARROW-7759: [C++][Dataset] Add CsvFileFormat

2020-04-24 Thread GitBox


bkietz opened a new pull request #7033:
URL: https://github.com/apache/arrow/pull/7033


   



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] nevi-me commented on pull request #7024: ARROW-8573: [Rust] Upgrade Rust to 1.44 nightly

2020-04-24 Thread GitBox


nevi-me commented on pull request #7024:
URL: https://github.com/apache/arrow/pull/7024#issuecomment-619189091


   @paddyhoran we might have to try a different nightly, as sometimes a day's version has no rustfmt. The change I made in that PR installs a nightly version; I don't know how to make GHA install a stable version of rustfmt.



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] fsaintjacques commented on pull request #7022: ARROW-8562: [C++] IO: Parameterize I/O Coalescing using S3 metrics

2020-04-24 Thread GitBox


fsaintjacques commented on pull request #7022:
URL: https://github.com/apache/arrow/pull/7022#issuecomment-619186142


   I'd say just plain HTTP; as @lidavidm pointed out in his comment, this is a network attribute.



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] mayuropensource commented on pull request #7022: ARROW-8562: [C++] IO: Parameterize I/O Coalescing using S3 metrics

2020-04-24 Thread GitBox


mayuropensource commented on pull request #7022:
URL: https://github.com/apache/arrow/pull/7022#issuecomment-619184138


   @fsaintjacques, I can try to put together a Python script using boto to determine the S3 metrics. Will that work for you?



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] github-actions[bot] commented on pull request #7032: ARROW-6603: [C#] Adds ArrayBuilder API to support writing null values

2020-04-24 Thread GitBox


github-actions[bot] commented on pull request #7032:
URL: https://github.com/apache/arrow/pull/7032#issuecomment-619164076


   https://issues.apache.org/jira/browse/ARROW-6603



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] zgramana commented on pull request #6121: ARROW-6603: [C#] - Nullable Array Support

2020-04-24 Thread GitBox


zgramana commented on pull request #6121:
URL: https://github.com/apache/arrow/pull/6121#issuecomment-619162558


   @eerhardt I've just submitted https://github.com/apache/arrow/pull/7032 for 
review/discussion



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] zgramana opened a new pull request #7032: ARROW-6603: [C#] Adds ArrayBuilder API to support writing null values

2020-04-24 Thread GitBox


zgramana opened a new pull request #7032:
URL: https://github.com/apache/arrow/pull/7032


   Takes an alternative approach to completing [ARROW-6603](https://issues.apache.org/jira/browse/ARROW-6603) that is in line with the current API and with other Arrow implementations. More specifically, this PR finishes the previously stubbed-out implementation already in the codebase (e.g. it uses the existing `NullBitmapBuffer`).
   
   The biggest challenge was finding a mechanism capable of supporting both 
nullable and non-nullable types with minimal changes to existing code. This was 
accomplished with the addition of the following builder interface member:
   
   ```csharp
   public interface IArrowArrayBuilder<TArray, TBuilder> : IArrowArrayBuilder<TArray>
   where TArray : IArrowArray
   where TBuilder : IArrowArrayBuilder<TArray>
   {
   ...
   TBuilder AppendNull();
   ...
   }
   ```
   
   The bulk of the implementation work focuses on adding the implementation of 
`AppendNull` to `PrimitiveArrayBuilder` and `Binary.BuilderBase`, and removing 
hardcoded `0`'s passed as arguments to `nullCount` in the `ArrayData` 
constructors.
   
   This adds two new tests to `ArrayBuilderTests`:
   
   * The first includes a number of `null` scenarios using `TestArrayBuilder(...)`.
   * The second includes a `StringArray.Builder` scenario with mixed `null` and `string.Empty` values. I implemented it such that `string.Empty` is considered a valid value (so it increments `Offset` without adding any bytes to `ValueBuffer`). When `null` is passed to `Append`, it just invokes `AppendNull` internally instead.
   
   And adds two new tests to `ArrowArrayTests` which focus on the behavior of 
`Slice`:
   
   * `SlicePrimitiveArrayWithNulls`
   * `SliceStringArrayWithNullsAndEmptyStrings`
   
   All 162 (existing + new) tests passed locally at the time this was submitted.



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] github-actions[bot] commented on pull request #7031: ARROW-8587: [C++] Fix linking Flight benchmarks

2020-04-24 Thread GitBox


github-actions[bot] commented on pull request #7031:
URL: https://github.com/apache/arrow/pull/7031#issuecomment-619156566


   https://issues.apache.org/jira/browse/ARROW-8587



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] pitrou opened a new pull request #7031: ARROW-8587: [C++] Fix linking Flight benchmarks

2020-04-24 Thread GitBox


pitrou opened a new pull request #7031:
URL: https://github.com/apache/arrow/pull/7031


   



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] sunchao commented on a change in pull request #6935: ARROW-8455: [Rust] Parquet Arrow column read on partially compatible files

2020-04-24 Thread GitBox


sunchao commented on a change in pull request #6935:
URL: https://github.com/apache/arrow/pull/6935#discussion_r414738189



##
File path: rust/parquet/src/column/reader.rs
##
@@ -190,15 +190,12 @@ impl ColumnReaderImpl {
 (self.num_buffered_values - self.num_decoded_values) as 
usize,
 );
 
-// Adjust batch size by taking into account how much space is 
left in
-// values slice or levels slices (if available)
-adjusted_size = min(adjusted_size, values.len() - values_read);
-if let Some(ref levels) = def_levels {
-adjusted_size = min(adjusted_size, levels.len() - 
levels_read);
-}
-if let Some(ref levels) = rep_levels {
-adjusted_size = min(adjusted_size, levels.len() - 
levels_read);
-}
+// Adjust batch size by taking into account how much data there is
+// to read. As batch_size is also smaller than the value and level
+// slices (if available), this ensures that the available space is not
+// exceeded.
+adjusted_size = min(adjusted_size, batch_size - values_read);
+adjusted_size = min(adjusted_size, batch_size - values_read);

Review comment:
   Yes. I just struggled to come up with a concrete example for this (i.e., 
with what values for `batch_size`, `def_levels.len()`, `rep_levels.len()` and 
`values.len()` this will occur). :)





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] kiszk commented on a change in pull request #7029: ARROW-8579 [C++] Add AVX512 SIMD for spaced decoding and encoding.

2020-04-24 Thread GitBox


kiszk commented on a change in pull request #7029:
URL: https://github.com/apache/arrow/pull/7029#discussion_r414739332



##
File path: cpp/src/arrow/util/spaced.h
##
@@ -0,0 +1,266 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/bit_util.h"
+
+#ifdef ARROW_HAVE_AVX512
+#include <immintrin.h>
+#endif  // ARROW_HAVE_AVX512
+
+namespace arrow {
+namespace util {
+namespace internal {
+
+template <typename T>
+int PutSpacedScalar(const T* values, int num_values, const uint8_t* valid_bits,
+int64_t valid_bits_offset, T* output) {
+  int num_valid_values = 0;
+  arrow::internal::BitmapReader valid_bits_reader(valid_bits, 
valid_bits_offset,
+  num_values);
+  for (int32_t i = 0; i < num_values; i++) {
+if (valid_bits_reader.IsSet()) {
+  output[num_valid_values++] = values[i];
+}
+valid_bits_reader.Next();
+  }
+  return num_valid_values;
+}
+
+template <typename T>
+int DecodeSpacedScalar(T* buffer, int num_values, int null_count,
+   const uint8_t* valid_bits, int64_t valid_bits_offset) {
+  const int values_read = num_values - null_count;
+
+  // Depending on the number of nulls, some of the value slots in buffer may
+  // be uninitialized, and this will cause valgrind warnings / potentially UB
+  memset(static_cast<void*>(buffer + values_read), 0,
+ (num_values - values_read) * sizeof(T));
+
+  // Add spacing for null entries. As we have filled the buffer from the front,
+  // we need to add the spacing from the back.
+  int values_to_move = values_read - 1;
+  // We stop early on one of two conditions:
+  // 1. There are no more null values that need spacing. Note we infer this
+  //    backwards: when 'i' is equal to 'values_to_move' it indicates
+  //    all nulls have been consumed.
+  // 2. There are no more non-null values that need to move, which indicates
+  //    all remaining slots are null, so their exact value doesn't matter.
+  for (int i = num_values - 1; (i > values_to_move) && (values_to_move >= 0); 
i--) {
+if (BitUtil::GetBit(valid_bits, valid_bits_offset + i)) {
+  buffer[i] = buffer[values_to_move];
+  values_to_move--;
+}
+  }
+  return num_values;
+}
+
+#if defined(ARROW_HAVE_AVX512)
+template <typename T>
+int PutSpacedAvx512Compress(const T* values, int num_values, const uint8_t* 
valid_bits,
+int64_t valid_bits_offset, T* output) {
+  assert(sizeof(T) == 4 || sizeof(T) == 8);  // Only support epi32 and epi64

Review comment:
   I see.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] sunchao commented on a change in pull request #6935: ARROW-8455: [Rust] Parquet Arrow column read on partially compatible files

2020-04-24 Thread GitBox


sunchao commented on a change in pull request #6935:
URL: https://github.com/apache/arrow/pull/6935#discussion_r414738189



##
File path: rust/parquet/src/column/reader.rs
##
@@ -190,15 +190,12 @@ impl ColumnReaderImpl {
 (self.num_buffered_values - self.num_decoded_values) as 
usize,
 );
 
-// Adjust batch size by taking into account how much space is 
left in
-// values slice or levels slices (if available)
-adjusted_size = min(adjusted_size, values.len() - values_read);
-if let Some(ref levels) = def_levels {
-adjusted_size = min(adjusted_size, levels.len() - 
levels_read);
-}
-if let Some(ref levels) = rep_levels {
-adjusted_size = min(adjusted_size, levels.len() - 
levels_read);
-}
+// Adjust batch size by taking into account how much data there is
+// to read. As batch_size is also smaller than the value and level
+// slices (if available), this ensures that the available space is not
+// exceeded.
+adjusted_size = min(adjusted_size, batch_size - values_read);

Review comment:
   Yes. I just struggled to come up with a concrete example for this. :)





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] sunchao commented on pull request #6949: ARROW-7681: [Rust] Explicitly seeking a BufReader will discard the internal buffer (2)

2020-04-24 Thread GitBox


sunchao commented on pull request #6949:
URL: https://github.com/apache/arrow/pull/6949#issuecomment-619140531


   >  It's the reader (file handle) that is passed to it that should be thread 
safe
   
   Is [file](https://doc.rust-lang.org/std/fs/struct.File.html) thread-safe? It's not obvious from the doc. Plus, the type parameter `R` here can be anything that implements `ParquetReader` (with read, seek, length and try_clone capabilities), so you cannot assume all of them guarantee thread safety.



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] BryanCutler commented on a change in pull request #6323: ARROW-7610: [Java] Finish support for 64 bit int allocations

2020-04-24 Thread GitBox


BryanCutler commented on a change in pull request #6323:
URL: https://github.com/apache/arrow/pull/6323#discussion_r414695756



##
File path: 
java/memory/src/main/java/org/apache/arrow/memory/NettyAllocationManager.java
##
@@ -17,48 +17,97 @@
 
 package org.apache.arrow.memory;
 
-import org.apache.arrow.memory.util.LargeMemoryUtil;
-
 import io.netty.buffer.PooledByteBufAllocatorL;
 import io.netty.buffer.UnsafeDirectLittleEndian;
+import io.netty.util.internal.PlatformDependent;
 
 /**
- * The default implementation of AllocationManagerBase. The implementation is 
responsible for managing when memory
+ * The default implementation of {@link AllocationManager}. The implementation 
is responsible for managing when memory
  * is allocated and returned to the Netty-based PooledByteBufAllocatorL.
  */
 public class NettyAllocationManager extends AllocationManager {
 
   public static final Factory FACTORY = new Factory();
 
+  /**
+   * The default cut-off value for switching allocation strategies.
+   * If the request size is not greater than the cut-off value, we will 
allocate memory by
+   * {@link PooledByteBufAllocatorL} APIs,
+   * otherwise, we will use {@link PlatformDependent} APIs.
+   */
+  public static final long DEFAULT_ALLOCATION_CUTOFF_VALUE;
+
+  public static final String DEFAULT_ALLOCATION_CUTOFF_NAME = 
"default.allocation.cutoff.name";
+
+  static {
+long cutOffValue;
+try {
+  cutOffValue = 
Long.parseLong(System.getProperty(DEFAULT_ALLOCATION_CUTOFF_NAME));
+} catch (Exception e) {
+  cutOffValue = Integer.MAX_VALUE;
+}
+DEFAULT_ALLOCATION_CUTOFF_VALUE = cutOffValue;
+  }
+
   private static final PooledByteBufAllocatorL INNER_ALLOCATOR = new 
PooledByteBufAllocatorL();
   static final UnsafeDirectLittleEndian EMPTY = INNER_ALLOCATOR.empty;
   static final long CHUNK_SIZE = INNER_ALLOCATOR.getChunkSize();
 
-  private final int allocatedSize;
+  private final long allocatedSize;
   private final UnsafeDirectLittleEndian memoryChunk;
+  private final long allocatedAddress;
+
+  /**
+   * The cut-off value for switching allocation strategies.
+   */
+  private final long allocationCutOffValue;

Review comment:
   This should be an `int` because it can't go above `Integer.MAX_VALUE`

##
File path: 
java/memory/src/main/java/org/apache/arrow/memory/NettyAllocationManager.java
##
@@ -17,48 +17,97 @@
 
 package org.apache.arrow.memory;
 
-import org.apache.arrow.memory.util.LargeMemoryUtil;
-
 import io.netty.buffer.PooledByteBufAllocatorL;
 import io.netty.buffer.UnsafeDirectLittleEndian;
+import io.netty.util.internal.PlatformDependent;
 
 /**
- * The default implementation of AllocationManagerBase. The implementation is 
responsible for managing when memory
+ * The default implementation of {@link AllocationManager}. The implementation 
is responsible for managing when memory
  * is allocated and returned to the Netty-based PooledByteBufAllocatorL.
  */
 public class NettyAllocationManager extends AllocationManager {
 
   public static final Factory FACTORY = new Factory();
 
+  /**
+   * The default cut-off value for switching allocation strategies.
+   * If the request size is not greater than the cut-off value, we will 
allocate memory by
+   * {@link PooledByteBufAllocatorL} APIs,
+   * otherwise, we will use {@link PlatformDependent} APIs.
+   */
+  public static final long DEFAULT_ALLOCATION_CUTOFF_VALUE;
+
+  public static final String DEFAULT_ALLOCATION_CUTOFF_NAME = 
"default.allocation.cutoff.name";
+
+  static {
+long cutOffValue;
+try {
+  cutOffValue = 
Long.parseLong(System.getProperty(DEFAULT_ALLOCATION_CUTOFF_NAME));
+} catch (Exception e) {
+  cutOffValue = Integer.MAX_VALUE;
+}
+DEFAULT_ALLOCATION_CUTOFF_VALUE = cutOffValue;
+  }
+
   private static final PooledByteBufAllocatorL INNER_ALLOCATOR = new 
PooledByteBufAllocatorL();
   static final UnsafeDirectLittleEndian EMPTY = INNER_ALLOCATOR.empty;
   static final long CHUNK_SIZE = INNER_ALLOCATOR.getChunkSize();
 
-  private final int allocatedSize;
+  private final long allocatedSize;
   private final UnsafeDirectLittleEndian memoryChunk;
+  private final long allocatedAddress;
+
+  /**
+   * The cut-off value for switching allocation strategies.
+   */
+  private final long allocationCutOffValue;
 
-  NettyAllocationManager(BaseAllocator accountingAllocator, int requestedSize) 
{
+  NettyAllocationManager(BaseAllocator accountingAllocator, long 
requestedSize, long allocationCutOffValue) {
 super(accountingAllocator);
-this.memoryChunk = INNER_ALLOCATOR.allocate(requestedSize);
-this.allocatedSize = memoryChunk.capacity();
+if (allocationCutOffValue > Integer.MAX_VALUE) {
+  throw new IllegalArgumentException("The cut-off value cannot be larger 
than Integer.MAX_VALUE");
+}
+this.allocationCutOffValue = allocationCutOffValue;
+
+if (requestedSize > allocationCutOffValue) {
+  this.memoryChunk 

[GitHub] [arrow] nealrichardson commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings

2020-04-24 Thread GitBox


nealrichardson commented on a change in pull request #7026:
URL: https://github.com/apache/arrow/pull/7026#discussion_r414680234



##
File path: r/src/expression.cpp
##
@@ -21,99 +21,97 @@
 
 // [[arrow::export]]
 std::shared_ptr<ds::Expression> dataset___expr__field_ref(std::string name) {
-  return std::make_shared<ds::FieldExpression>(std::move(name));
+  return ds::field_ref(std::move(name));
 }
 
 // [[arrow::export]]
-std::shared_ptr<ds::ComparisonExpression> dataset___expr__equal(
+std::shared_ptr<ds::Expression> dataset___expr__equal(
     const std::shared_ptr<ds::Expression>& lhs,
     const std::shared_ptr<ds::Expression>& rhs) {
   return ds::equal(lhs, rhs);
 }
 
 // [[arrow::export]]
-std::shared_ptr<ds::ComparisonExpression> dataset___expr__not_equal(
+std::shared_ptr<ds::Expression> dataset___expr__not_equal(
     const std::shared_ptr<ds::Expression>& lhs,
     const std::shared_ptr<ds::Expression>& rhs) {
   return ds::not_equal(lhs, rhs);
 }
 
 // [[arrow::export]]
-std::shared_ptr<ds::ComparisonExpression> dataset___expr__greater(
+std::shared_ptr<ds::Expression> dataset___expr__greater(
     const std::shared_ptr<ds::Expression>& lhs,
     const std::shared_ptr<ds::Expression>& rhs) {
   return ds::greater(lhs, rhs);
 }
 
 // [[arrow::export]]
-std::shared_ptr<ds::ComparisonExpression> dataset___expr__greater_equal(
+std::shared_ptr<ds::Expression> dataset___expr__greater_equal(
     const std::shared_ptr<ds::Expression>& lhs,
     const std::shared_ptr<ds::Expression>& rhs) {
   return ds::greater_equal(lhs, rhs);
 }
 
 // [[arrow::export]]
-std::shared_ptr<ds::ComparisonExpression> dataset___expr__less(
+std::shared_ptr<ds::Expression> dataset___expr__less(
     const std::shared_ptr<ds::Expression>& lhs,
     const std::shared_ptr<ds::Expression>& rhs) {
   return ds::less(lhs, rhs);
 }
 
 // [[arrow::export]]
-std::shared_ptr<ds::ComparisonExpression> dataset___expr__less_equal(
+std::shared_ptr<ds::Expression> dataset___expr__less_equal(
     const std::shared_ptr<ds::Expression>& lhs,
     const std::shared_ptr<ds::Expression>& rhs) {
   return ds::less_equal(lhs, rhs);
 }
 
 // [[arrow::export]]
-std::shared_ptr<ds::InExpression> dataset___expr__in(
+std::shared_ptr<ds::Expression> dataset___expr__in(
     const std::shared_ptr<ds::Expression>& lhs,
     const std::shared_ptr<arrow::Array>& rhs) {
-  return std::make_shared<ds::InExpression>(lhs->In(rhs));
+  return lhs->In(rhs).Copy();

Review comment:
   Not a big deal to me





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] nevi-me commented on a change in pull request #6306: ARROW-7705: [Rust] Initial sort implementation

2020-04-24 Thread GitBox


nevi-me commented on a change in pull request #6306:
URL: https://github.com/apache/arrow/pull/6306#discussion_r414663303



##
File path: rust/arrow/src/compute/kernels/sort.rs
##
@@ -0,0 +1,671 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Defines sort kernel for `ArrayRef`
+
+use std::cmp::Reverse;
+
+use crate::array::*;
+use crate::compute::take;
+use crate::datatypes::*;
+use crate::error::{ArrowError, Result};
+
+use TimeUnit::*;
+
+/// Sort the `ArrayRef` using `SortOptions`.
+///
+/// Performs a stable sort on values and indices, returning nulls after sorted 
valid values,
+/// while preserving the order of the nulls.
+///
+/// Returns and `ArrowError::ComputeError(String)` if the array type is either 
unsupported by `sort_to_indices` or `take`.

Review comment:
   Thanks, I'll merge this after CI completes





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] nevi-me commented on a change in pull request #6306: ARROW-7705: [Rust] Initial sort implementation

2020-04-24 Thread GitBox


nevi-me commented on a change in pull request #6306:
URL: https://github.com/apache/arrow/pull/6306#discussion_r414662522



##
File path: rust/arrow/src/compute/kernels/sort.rs
##
@@ -0,0 +1,671 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Defines sort kernel for `ArrayRef`
+
+use std::cmp::Reverse;
+
+use crate::array::*;
+use crate::compute::take;
+use crate::datatypes::*;
+use crate::error::{ArrowError, Result};
+
+use TimeUnit::*;
+
+/// Sort the `ArrayRef` using `SortOptions`.
+///
+/// Performs a stable sort on values and indices, returning nulls after sorted 
valid values,
+/// while preserving the order of the nulls.
+///
+/// Returns and `ArrowError::ComputeError(String)` if the array type is either 
unsupported by `sort_to_indices` or `take`.

Review comment:
   ```suggestion
   /// Returns an `ArrowError::ComputeError(String)` if the array type is 
either unsupported by `sort_to_indices` or `take`.
   ```





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] nealrichardson commented on pull request #7028: ARROW-8575: [Developer] Add issue_comment workflow to rebase a PR

2020-04-24 Thread GitBox


nealrichardson commented on pull request #7028:
URL: https://github.com/apache/arrow/pull/7028#issuecomment-619074972


   I'm not worried about security risks in this particular case. If some 
random person wants to rebase my PR on apache/arrow@master, great! Now I don't 
have to! While I see how other workflows that push might be vulnerable, this 
workflow is not open-ended, does not take any arguments, and only pushes on 
success, so I don't understand how it could be exploited. Am I missing 
something?
   
   Involving Python here sounds like overkill.



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] rdettai edited a comment on pull request #6949: ARROW-7681: [Rust] Explicitly seeking a BufReader will discard the internal buffer (2)

2020-04-24 Thread GitBox


rdettai edited a comment on pull request #6949:
URL: https://github.com/apache/arrow/pull/6949#issuecomment-618997029


   > Originally we designed it this way so that we can concurrently read 
multiple column chunks after obtaining file handle from a single row group. 
Since the file handle is shared between these we wanted to provide thread 
safety on top of it.
   
   I'm not sure I understand how this could be the concern of the 
`FileSource`. It's the reader (file handle) passed to it that should be 
thread-safe, and `FileSource` should not create the Mutex itself.
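
A minimal Rust sketch of the design being argued for, with illustrative names only (not the parquet crate's actual types): the caller decides whether the handle needs locking, and the source simply stores whatever reader it is given:

```rust
use std::sync::{Arc, Mutex};

// Illustrative stand-in for FileSource: it stores the handle it is given
// and never introduces a Mutex of its own.
#[allow(dead_code)]
struct Source<R> {
    reader: R,
}

fn main() -> std::io::Result<()> {
    let file = std::fs::File::open("data.parquet")?;
    // The caller opts into sharing: each column-chunk source receives a
    // clone of the same locked handle. A single-threaded caller could pass
    // the bare File instead and pay no locking cost.
    let shared = Arc::new(Mutex::new(file));
    let _chunk_a = Source { reader: Arc::clone(&shared) };
    let _chunk_b = Source { reader: shared };
    Ok(())
}
```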



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] rdettai commented on pull request #6949: ARROW-7681: [Rust] Explicitly seeking a BufReader will discard the internal buffer (2)

2020-04-24 Thread GitBox


rdettai commented on pull request #6949:
URL: https://github.com/apache/arrow/pull/6949#issuecomment-618997029


   > Originally we designed it this way so that we can concurrently read 
multiple column chunks after obtaining file handle from a single row group. 
Since the file handle is shared between these we wanted to provide thread 
safety on top of it.
   
   I'm not sure I understand how this could be the concern of the 
`FileSource`. It's the reader (file handle) passed to it that should be 
thread-safe, and it should not create the Mutex itself.



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] fsaintjacques commented on pull request #7022: ARROW-8562: [C++] IO: Parameterize I/O Coalescing using S3 metrics

2020-04-24 Thread GitBox


fsaintjacques commented on pull request #7022:
URL: https://github.com/apache/arrow/pull/7022#issuecomment-618985822


   Could you include a script/utility to compute both metrics? Paired with 
toxiproxy, we could replicate S3 region behavior on localhost.



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] jianxind commented on a change in pull request #7029: ARROW-8579 [C++] Add AVX512 SIMD for spaced decoding and encoding.

2020-04-24 Thread GitBox


jianxind commented on a change in pull request #7029:
URL: https://github.com/apache/arrow/pull/7029#discussion_r414505645



##
File path: cpp/src/arrow/util/spaced.h
##
@@ -0,0 +1,266 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/bit_util.h"
+
+#ifdef ARROW_HAVE_AVX512
+#include <immintrin.h>
+#endif  // ARROW_HAVE_AVX512
+
+namespace arrow {
+namespace util {
+namespace internal {
+
+template <typename T>
+int PutSpacedScalar(const T* values, int num_values, const uint8_t* valid_bits,
+int64_t valid_bits_offset, T* output) {
+  int num_valid_values = 0;
+  arrow::internal::BitmapReader valid_bits_reader(valid_bits, 
valid_bits_offset,
+  num_values);
+  for (int32_t i = 0; i < num_values; i++) {
+if (valid_bits_reader.IsSet()) {
+  output[num_valid_values++] = values[i];
+}
+valid_bits_reader.Next();
+  }
+  return num_valid_values;
+}
+
+template <typename T>
+int DecodeSpacedScalar(T* buffer, int num_values, int null_count,
+   const uint8_t* valid_bits, int64_t valid_bits_offset) {
+  const int values_read = num_values - null_count;
+
+  // Depending on the number of nulls, some of the value slots in buffer may
+  // be uninitialized, and this will cause valgrind warnings / potentially UB
+  memset(static_cast(buffer + values_read), 0,
+ (num_values - values_read) * sizeof(T));
+
+  // Add spacing for null entries. As we have filled the buffer from the front,
+  // we need to add the spacing from the back.
+  int values_to_move = values_read - 1;
+  // We stop early on one of two conditions:
+  // 1. There are no more null values that need spacing.  Note we infer this
+  // backwards, when 'i' is equal to 'values_to_move' it indicates
+  //all nulls have been consumed.
+  // 2. There are no more non-null values that need to move which indicates
+  //all remaining slots are null, so their exact value doesn't matter.
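+  // Worked example (annotation added for this archive, not in the patch):
+  //   num_values = 4, null_count = 2, valid bits (LSB first) = 1 0 0 1,
+  //   buffer front-filled = [a, b, _, _], values_to_move = 1.
+  //   i = 3: bit set   -> buffer[3] = buffer[1] (= b); values_to_move = 0
+  //   i = 2: bit unset -> slot stays null
+  //   i = 1: bit unset -> slot stays null; loop then stops (i == values_to_move)
+  //   Result: [a, _, _, b], the two valid slots holding a and b in order.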
+  for (int i = num_values - 1; (i > values_to_move) && (values_to_move >= 0); 
i--) {
+if (BitUtil::GetBit(valid_bits, valid_bits_offset + i)) {
+  buffer[i] = buffer[values_to_move];
+  values_to_move--;
+}
+  }
+  return num_values;
+}
+
+#if defined(ARROW_HAVE_AVX512)
+template <typename T>
+int PutSpacedAvx512Compress(const T* values, int num_values, const uint8_t* 
valid_bits,
+int64_t valid_bits_offset, T* output) {
+  assert(sizeof(T) == 4 || sizeof(T) == 8);  // Only support epi32 and epi64

Review comment:
   Can't. The `PutSpaced` instantiations for other types still compile this 
code even though it will never be reached at runtime.
   
   /mnt/github/arrow/cpp/src/arrow/util/spaced.h: In instantiation of 'int 
arrow::util::internal::PutSpacedAvx512Compress(const T*, int, const uint8_t*, 
int64_t, T*) [with T = parquet::Int96; uint8_t = unsigned char; int64_t = long 
int]':
   /mnt/github/arrow/cpp/src/arrow/util/spaced.h:224:38:   required from 'int 
arrow::util::internal::PutSpacedAvx512(const T*, int, const uint8_t*, int64_t, 
T*) [with T = parquet::Int96; uint8_t = unsigned char; int64_t = long int]'
   /mnt/github/arrow/cpp/src/arrow/util/spaced.h:246:28:   required from 'int 
arrow::util::internal::PutSpaced(const T*, int, const uint8_t*, int64_t, T*) 
[with T = parquet::Int96; uint8_t = unsigned char; int64_t = long int]'
   /mnt/github/arrow/cpp/src/parquet/encoding.cc:107:63:   required from 'void 
parquet::PlainEncoder<DType>::PutSpaced(const T*, int, const uint8_t*, int64_t) 
[with DType = parquet::PhysicalType<(parquet::Type::type)3>; 
parquet::PlainEncoder<DType>::T = parquet::Int96; uint8_t = unsigned char; 
int64_t = long int]'
   





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] kszucs edited a comment on pull request #7028: ARROW-8575: [Developer] Add issue_comment workflow to rebase a PR

2020-04-24 Thread GitBox


kszucs edited a comment on pull request #7028:
URL: https://github.com/apache/arrow/pull/7028#issuecomment-618944665


   There is another security constraint with this approach: anyone can trigger 
a rebase on the PR, not just the participants / committers. To resolve that you 
need to check `author_association` in the event payload; see the python bot's 
implementation 
[here](https://github.com/apache/arrow/blob/master/dev/archery/archery/bot.py#L179).
   
   We could add a feature to the python bot for triggering bash scripts, making 
it easier to contribute to the comment bot while reusing the existing comment 
handling and responding mechanism.
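
A minimal sketch of the gate being described, written in Rust with serde_json purely for illustration (the real check lives in the Python archery bot linked above, and which associations count as trusted is a policy choice assumed here):

```rust
use serde_json::Value;

/// Returns true only when the commenting user has a trusted association,
/// following the field layout of GitHub's `issue_comment` event payload.
fn is_authorized(event_payload: &str) -> bool {
    let payload: Value = match serde_json::from_str(event_payload) {
        Ok(v) => v,
        Err(_) => return false,
    };
    matches!(
        payload["comment"]["author_association"].as_str(),
        Some("OWNER") | Some("MEMBER") | Some("COLLABORATOR")
    )
}

fn main() {
    let event = r#"{"comment":{"author_association":"MEMBER"}}"#;
    assert!(is_authorized(event));
}
```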



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] kszucs edited a comment on pull request #7028: ARROW-8575: [Developer] Add issue_comment workflow to rebase a PR

2020-04-24 Thread GitBox


kszucs edited a comment on pull request #7028:
URL: https://github.com/apache/arrow/pull/7028#issuecomment-618944665


   There is another security constraint with this approach: anyone can trigger 
a rebase on the PR, not just the participants / committers. To resolve that you 
need to check `author_association` in the event payload; see the python bot's 
implementation 
[here](https://github.com/apache/arrow/blob/master/dev/archery/archery/bot.py#L179).



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] kszucs commented on pull request #7028: ARROW-8575: [Developer] Add issue_comment workflow to rebase a PR

2020-04-24 Thread GitBox


kszucs commented on pull request #7028:
URL: https://github.com/apache/arrow/pull/7028#issuecomment-618944665


   There is another security constraint with this approach: anyone can trigger 
a rebase on the PR, not just the participants. To resolve that you need to check 
`author_association` in the event payload; see the python bot's implementation 
[here](https://github.com/apache/arrow/blob/master/dev/archery/archery/bot.py#L179).



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] nevi-me commented on pull request #7018: ARROW-8536: [Rust] [Flight] Check in proto file, conditional build if file exists

2020-04-24 Thread GitBox


nevi-me commented on pull request #7018:
URL: https://github.com/apache/arrow/pull/7018#issuecomment-618942929


   > @nevi-me This is looking good, but the generated source file needs the ASF 
header. CI is failing with ` apache-rat license violation: 
rust/arrow-flight/src/arrow.flight.protocol.rs`.
   
   I've added an exception instead; I hope that's fine



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] github-actions[bot] commented on pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API

2020-04-24 Thread GitBox


github-actions[bot] commented on pull request #7030:
URL: https://github.com/apache/arrow/pull/7030#issuecomment-618919224


   https://issues.apache.org/jira/browse/ARROW-7808



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] pitrou commented on pull request #7029: ARROW-8579 [C++] Add AVX512 SIMD for spaced decoding and encoding.

2020-04-24 Thread GitBox


pitrou commented on pull request #7029:
URL: https://github.com/apache/arrow/pull/7029#issuecomment-618917554


   I'd gladly see an AVX2 or SSE version indeed, as many CPUs don't have AVX512.



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] zhztheplayer opened a new pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API

2020-04-24 Thread GitBox


zhztheplayer opened a new pull request #7030:
URL: https://github.com/apache/arrow/pull/7030


   Add the following Datasets APIs to Java:
   
   - DatasetFactory
   - Dataset
   - Scanner
   - ScanTask
   
   Add a native dataset path to bridge the C++ Datasets components to Java:
   
   - NativeDatasetFactory
   - NativeDataset
   - NativeScanner
   - NativeScanTask
   
   Add a simple reference `NativeDatasetFactory` implementation, which is 
already useful in some scenarios:
   
   - SingleFileDatasetFactory
   - SingleFileDataset
   
   
   Unit tests are based on `SingleFileDataset`/`NativeDatasetFactory`. I may 
add more test cases to this PR over time.



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] kiszk commented on a change in pull request #7029: ARROW-8579 [C++] Add AVX512 SIMD for spaced decoding and encoding.

2020-04-24 Thread GitBox


kiszk commented on a change in pull request #7029:
URL: https://github.com/apache/arrow/pull/7029#discussion_r414434809



##
File path: cpp/src/arrow/util/spaced.h
##
@@ -0,0 +1,266 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/bit_util.h"
+
+#ifdef ARROW_HAVE_AVX512
+#include <immintrin.h>
+#endif  // ARROW_HAVE_AVX512
+
+namespace arrow {
+namespace util {
+namespace internal {
+
+template <typename T>
+int PutSpacedScalar(const T* values, int num_values, const uint8_t* valid_bits,
+int64_t valid_bits_offset, T* output) {
+  int num_valid_values = 0;
+  arrow::internal::BitmapReader valid_bits_reader(valid_bits, 
valid_bits_offset,
+  num_values);
+  for (int32_t i = 0; i < num_values; i++) {
+if (valid_bits_reader.IsSet()) {
+  output[num_valid_values++] = values[i];
+}
+valid_bits_reader.Next();
+  }
+  return num_valid_values;
+}
+
+template <typename T>
+int DecodeSpacedScalar(T* buffer, int num_values, int null_count,
+   const uint8_t* valid_bits, int64_t valid_bits_offset) {
+  const int values_read = num_values - null_count;
+
+  // Depending on the number of nulls, some of the value slots in buffer may
+  // be uninitialized, and this will cause valgrind warnings / potentially UB
+  memset(static_cast(buffer + values_read), 0,
+ (num_values - values_read) * sizeof(T));
+
+  // Add spacing for null entries. As we have filled the buffer from the front,
+  // we need to add the spacing from the back.
+  int values_to_move = values_read - 1;
+  // We stop early on one of two conditions:
+  // 1. There are no more null values that need spacing.  Note we infer this
+  // backwards, when 'i' is equal to 'values_to_move' it indicates
+  //all nulls have been consumed.
+  // 2. There are no more non-null values that need to move which indicates
+  //all remaining slots are null, so their exact value doesn't matter.
+  for (int i = num_values - 1; (i > values_to_move) && (values_to_move >= 0); 
i--) {
+if (BitUtil::GetBit(valid_bits, valid_bits_offset + i)) {
+  buffer[i] = buffer[values_to_move];
+  values_to_move--;
+}
+  }
+  return num_values;
+}
+
+#if defined(ARROW_HAVE_AVX512)
+template <typename T>
+int PutSpacedAvx512Compress(const T* values, int num_values, const uint8_t* 
valid_bits,
+int64_t valid_bits_offset, T* output) {
+  assert(sizeof(T) == 4 || sizeof(T) == 8);  // Only support epi32 and epi64

Review comment:
   How about using `static_assert()`?





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] kiszk commented on a change in pull request #7029: ARROW-8579 [C++] Add AVX512 SIMD for spaced decoding and encoding.

2020-04-24 Thread GitBox


kiszk commented on a change in pull request #7029:
URL: https://github.com/apache/arrow/pull/7029#discussion_r414434434



##
File path: cpp/src/arrow/util/spaced.h
##
@@ -0,0 +1,266 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/bit_util.h"
+
+#ifdef ARROW_HAVE_AVX512
+#include <immintrin.h>
+#endif  // ARROW_HAVE_AVX512
+
+namespace arrow {
+namespace util {
+namespace internal {
+
+template <typename T>
+int PutSpacedScalar(const T* values, int num_values, const uint8_t* valid_bits,
+int64_t valid_bits_offset, T* output) {
+  int num_valid_values = 0;
+  arrow::internal::BitmapReader valid_bits_reader(valid_bits, 
valid_bits_offset,
+  num_values);
+  for (int32_t i = 0; i < num_values; i++) {
+if (valid_bits_reader.IsSet()) {
+  output[num_valid_values++] = values[i];
+}
+valid_bits_reader.Next();
+  }
+  return num_valid_values;
+}
+
+template <typename T>
+int DecodeSpacedScalar(T* buffer, int num_values, int null_count,
+   const uint8_t* valid_bits, int64_t valid_bits_offset) {
+  const int values_read = num_values - null_count;
+
+  // Depending on the number of nulls, some of the value slots in buffer may
+  // be uninitialized, and this will cause valgrind warnings / potentially UB
+  memset(static_cast(buffer + values_read), 0,
+ (num_values - values_read) * sizeof(T));
+
+  // Add spacing for null entries. As we have filled the buffer from the front,
+  // we need to add the spacing from the back.
+  int values_to_move = values_read - 1;
+  // We stop early on one of two conditions:
+  // 1. There are no more null values that need spacing.  Note we infer this
+  // backwards, when 'i' is equal to 'values_to_move' it indicates
+  //all nulls have been consumed.
+  // 2. There are no more non-null values that need to move which indicates
+  //all remaining slots are null, so their exact value doesn't matter.
+  for (int i = num_values - 1; (i > values_to_move) && (values_to_move >= 0); 
i--) {
+if (BitUtil::GetBit(valid_bits, valid_bits_offset + i)) {
+  buffer[i] = buffer[values_to_move];
+  values_to_move--;
+}
+  }
+  return num_values;
+}
+
+#if defined(ARROW_HAVE_AVX512)
+template <typename T>
+int PutSpacedAvx512Compress(const T* values, int num_values, const uint8_t* 
valid_bits,
+int64_t valid_bits_offset, T* output) {
+  assert(sizeof(T) == 4 || sizeof(T) == 8);  // Only support epi32 and epi64
+  constexpr int kBatchSize = sizeof(__m512i) / sizeof(T);
+  constexpr int kBatchValidBytes = kBatchSize / 8;
+  int num_valid_values = 0;
+  int idx_values = 0;
+  int64_t idx_valid_bits = valid_bits_offset;
+
+  // First handle the front suffix
+  const int64_t offset_suffix_front = 8 - (valid_bits_offset % 8);
+  for (int64_t i = 0; (i < offset_suffix_front) && (idx_values < num_values); 
i++) {
+if (BitUtil::GetBit(valid_bits, idx_valid_bits)) {
+  output[num_valid_values] = values[idx_values];
+  num_valid_values++;
+}
+idx_values++;
+idx_valid_bits++;
+  }
+
+  // The parts can fill into batches
+  uint8_t valid_count;
+  int64_t idx_valid_bytes = BitUtil::BytesForBits(idx_valid_bits + 1) - 1;
+  static const __m512i zero = _mm512_set_epi64(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
0x0, 0x0);
+  while (num_values - idx_values >= kBatchSize) {
+// count the valid numbers of one batch.
+valid_count = BitUtil::kBytePopcount[valid_bits[idx_valid_bytes]];
+if (kBatchValidBytes > 1) {
+  valid_count += BitUtil::kBytePopcount[valid_bits[idx_valid_bytes + 1]];
+}
+
+// pack the data
+if (valid_count > 0) {
+  __m512i src = _mm512_loadu_si512(values + idx_values);
+  __m512i result;
+  if (sizeof(T) == 4) {
+// 16 float for one m512i block, two bytes in valid_bits
+__mmask16 k = *(reinterpret_cast<const __mmask16*>(valid_bits + idx_valid_bytes));
+result = _mm512_mask_compress_epi32(zero, k, src);
+  } else {
+// 8 double for one m512i block, one byte in valid_bits
+__mmask8 k = *(valid_bits + idx_valid_bytes);
+result = _mm512_mask_compress_epi64(zero, k, src);
+  }
+
+  

[GitHub] [arrow] rdettai commented on a change in pull request #6935: ARROW-8455: [Rust] Parquet Arrow column read on partially compatible files

2020-04-24 Thread GitBox


rdettai commented on a change in pull request #6935:
URL: https://github.com/apache/arrow/pull/6935#discussion_r414417688



##
File path: rust/parquet/src/column/reader.rs
##
@@ -190,15 +190,12 @@ impl<T: DataType> ColumnReaderImpl<T> {
 (self.num_buffered_values - self.num_decoded_values) as 
usize,
 );
 
-// Adjust batch size by taking into account how much space is 
left in
-// values slice or levels slices (if available)
-adjusted_size = min(adjusted_size, values.len() - values_read);
-if let Some(ref levels) = def_levels {
-adjusted_size = min(adjusted_size, levels.len() - 
levels_read);
-}
-if let Some(ref levels) = rep_levels {
-adjusted_size = min(adjusted_size, levels.len() - 
levels_read);
-}
+// Adjust batch size by taking into account how much data there is
+// to read. As batch_size is also smaller than the value and level
+// slices (if available), this ensures that the available space is not
+// exceeded.
+adjusted_size = min(adjusted_size, batch_size - values_read);

Review comment:
   As stated in my PR comment, the `read_batch` function can receive any 
combination of `batch_size`, `def_levels.len()`, `rep_levels.len()` and 
`values.len()`. If `batch_size` is the limiting factor, `iter_batch_size` might 
otherwise end up larger than `batch_size`. This happened to me when `read_batch` 
was called from `record_reader.rs` on a parquet file with relatively small row 
groups (500k rows). I did not manage to reproduce the phenomenon with a mock 
data file though ... :-/
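
A hedged sketch of the clamping under discussion (simplified: the real code tracks `levels_read` separately for the level slices; names mirror the diff, but this is not the crate's code):

```rust
use std::cmp::min;

// How many values to read this iteration, so that neither the caller's
// batch_size nor any provided slice is overrun.
fn adjusted_read_size(
    buffered_remaining: usize,
    batch_size: usize,
    values_read: usize,
    values_len: usize,
    def_levels_len: Option<usize>,
) -> usize {
    let mut size = min(buffered_remaining, batch_size - values_read);
    size = min(size, values_len - values_read);
    if let Some(len) = def_levels_len {
        size = min(size, len - values_read);
    }
    size
}

fn main() {
    // With a small batch_size, it is the limiting factor even when the row
    // group still buffers many more values.
    assert_eq!(adjusted_read_size(500_000, 1024, 0, 4096, None), 1024);
}
```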





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] jianxind commented on pull request #7029: ARROW-8579 [C++] Add AVX512 SIMD for spaced decoding and encoding.

2020-04-24 Thread GitBox


jianxind commented on pull request #7029:
URL: https://github.com/apache/arrow/pull/7029#issuecomment-618855696


   cc @emkornfield 
   
   The AVX512 path is straightforward thanks to the mask_compress/mask_expand 
APIs that AVX512 provides. For a potential SSE/AVX2 path, as you pointed out in 
the Jira, a solution with a fixed lookup table may help; I will explore that 
later, but it will definitely take more time, so I am committing this finished 
part first.
   
   Below is the benchmark data on an AVX512 device, before and after adding the 
intrinsics:
   
   Before:
   BM_PlainEncodingSpacedFloat/1024    1471 ns  1469 ns   476373  bytes_per_second=2.59603G/s
   BM_PlainEncodingSpacedDouble/1024   1498 ns  1496 ns   468131  bytes_per_second=5.09834G/s
   BM_PlainDecodingSpacedFloat/1024    1266 ns  1265 ns   554320  bytes_per_second=3.01623G/s
   BM_PlainDecodingSpacedDouble/1024    920 ns   919 ns   759151  bytes_per_second=8.30509G/s
   
   After:
   BM_PlainEncodingSpacedFloat/1024     717 ns   716 ns   973249  bytes_per_second=5.32424G/s
   BM_PlainEncodingSpacedDouble/1024    989 ns   988 ns   710096  bytes_per_second=7.7235G/s
   BM_PlainDecodingSpacedFloat/1024     217 ns   217 ns  3233406  bytes_per_second=17.613G/s
   BM_PlainDecodingSpacedDouble/1024    309 ns   309 ns  2267740  bytes_per_second=24.7257G/s



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] github-actions[bot] commented on pull request #7029: ARROW-8579 [C++] Add AVX512 SIMD for spaced decoding and encoding.

2020-04-24 Thread GitBox


github-actions[bot] commented on pull request #7029:
URL: https://github.com/apache/arrow/pull/7029#issuecomment-618853399


   https://issues.apache.org/jira/browse/ARROW-8579



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] jianxind opened a new pull request #7029: ARROW-8579 [C++] Add AVX512 SIMD for spaced decoding and encoding.

2020-04-24 Thread GitBox


jianxind opened a new pull request #7029:
URL: https://github.com/apache/arrow/pull/7029


   1. Create the spaced encoding/decoding benchmark items.
   2. Create unit tests for the spaced API SIMD implementation.
   3. Move the spaced scalar/SIMD code to a new header file.
   4. Add an AVX512 path for epi32 and epi64 using the 
mask_compress_/mask_expand_ APIs.
   
   Signed-off-by: Frank Du 



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] liyafan82 commented on a change in pull request #6323: ARROW-7610: [Java] Finish support for 64 bit int allocations

2020-04-24 Thread GitBox


liyafan82 commented on a change in pull request #6323:
URL: https://github.com/apache/arrow/pull/6323#discussion_r414332379



##
File path: 
java/memory/src/test/java/org/apache/arrow/memory/TestLargeArrowBuf.java
##
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.memory;
+
+import static org.junit.Assert.assertEquals;
+
+import io.netty.buffer.ArrowBuf;
+
+/**
+ * Integration test for large (more than 2GB) {@link io.netty.buffer.ArrowBuf}.
+ * To run this test, please
+ * <li>Make sure there are 4GB memory available in the system.</li>
+ * <li>
+ *   Make sure the default allocation manager type is unsafe.
+ *   This can be achieved by the environmental variable or system property.
+ *   The details can be found in {@link DefaultAllocationManagerOption}.
+ * </li>
+ */
+public class TestLargeArrowBuf {
+
+  private static void testLargeArrowBuf() {
+final long bufSize = 4 * 1024 * 1024 * 1024L;
+try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+ ArrowBuf largeBuf = allocator.buffer(bufSize)) {
+  assertEquals(bufSize, largeBuf.capacity());
+  System.out.println("Successfully allocated a buffer with capacity " + 
largeBuf.capacity());
+
+  for (long i = 0; i < bufSize / 8; i++) {
+largeBuf.setLong(i * 8, i);
+
+if ((i + 1) % 1_000_000 == 0) { // progress interval assumed; digits garbled in archive
+  System.out.println("Successfully written " + (i + 1) + " long 
words");
+}
+  }
+  System.out.println("Successfully written " + (bufSize / 8) + " long 
words");
+
+  for (long i = 0; i < bufSize / 8; i++) {
+long val = largeBuf.getLong(i * 8);
+assertEquals(i, val);
+
+if ((i + 1) % 1_000_000 == 0) { // progress interval assumed; digits garbled in archive
+  System.out.println("Successfully read " + (i + 1) + " long words");
+}
+  }
+  System.out.println("Successfully read " + (bufSize / 8) + " long words");
+}
+System.out.println("Successfully released the large buffer.");
+  }
+
+  public static void main(String[] args) {

Review comment:
   Sounds good. 
   I have revised the code to make the cut-off value configurable, and added 
cases to test the scenarios when the request size is below/above the cut-off 
value. Please see if it looks good to you. Thanks. 





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] liyafan82 commented on a change in pull request #6323: ARROW-7610: [Java] Finish support for 64 bit int allocations

2020-04-24 Thread GitBox


liyafan82 commented on a change in pull request #6323:
URL: https://github.com/apache/arrow/pull/6323#discussion_r414330656



##
File path: 
java/memory/src/main/java/org/apache/arrow/memory/NettyAllocationManager.java
##
@@ -34,31 +33,34 @@
   static final UnsafeDirectLittleEndian EMPTY = INNER_ALLOCATOR.empty;
   static final long CHUNK_SIZE = INNER_ALLOCATOR.getChunkSize();
 
-  private final int allocatedSize;
+  private final long allocatedSize;
   private final UnsafeDirectLittleEndian memoryChunk;
+  private final long allocatedAddress;
 
-  NettyAllocationManager(BaseAllocator accountingAllocator, int requestedSize) 
{
+  NettyAllocationManager(BaseAllocator accountingAllocator, long 
requestedSize) {
 super(accountingAllocator);
-this.memoryChunk = INNER_ALLOCATOR.allocate(requestedSize);
-this.allocatedSize = memoryChunk.capacity();
-  }
-
-  /**
-   * Get the underlying memory chunk managed by this AllocationManager.
-   * @return buffer
-   */
-  UnsafeDirectLittleEndian getMemoryChunk() {
-return memoryChunk;
+if (requestedSize > Integer.MAX_VALUE) {
+  memoryChunk = null;

Review comment:
   Revised accordingly. Thank you. 





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] liyafan82 commented on a change in pull request #6323: ARROW-7610: [Java] Finish support for 64 bit int allocations

2020-04-24 Thread GitBox


liyafan82 commented on a change in pull request #6323:
URL: https://github.com/apache/arrow/pull/6323#discussion_r414330471



##
File path: 
java/memory/src/main/java/org/apache/arrow/memory/NettyAllocationManager.java
##
@@ -34,31 +33,34 @@
   static final UnsafeDirectLittleEndian EMPTY = INNER_ALLOCATOR.empty;
   static final long CHUNK_SIZE = INNER_ALLOCATOR.getChunkSize();
 
-  private final int allocatedSize;
+  private final long allocatedSize;
   private final UnsafeDirectLittleEndian memoryChunk;
+  private final long allocatedAddress;
 
-  NettyAllocationManager(BaseAllocator accountingAllocator, int requestedSize) 
{
+  NettyAllocationManager(BaseAllocator accountingAllocator, long 
requestedSize) {
 super(accountingAllocator);
-this.memoryChunk = INNER_ALLOCATOR.allocate(requestedSize);
-this.allocatedSize = memoryChunk.capacity();
-  }
-
-  /**
-   * Get the underlying memory chunk managed by this AllocationManager.
-   * @return buffer
-   */
-  UnsafeDirectLittleEndian getMemoryChunk() {
-return memoryChunk;
+if (requestedSize > Integer.MAX_VALUE) {
+  memoryChunk = null;
+  allocatedAddress = PlatformDependent.allocateMemory(requestedSize);
+} else {
+  this.memoryChunk = INNER_ALLOCATOR.allocate(requestedSize);
+  allocatedAddress = memoryChunk.memoryAddress();

Review comment:
   Sure. Revised. Please check. 





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] liyafan82 commented on a change in pull request #6323: ARROW-7610: [Java] Finish support for 64 bit int allocations

2020-04-24 Thread GitBox


liyafan82 commented on a change in pull request #6323:
URL: https://github.com/apache/arrow/pull/6323#discussion_r414330317



##
File path: 
java/memory/src/main/java/org/apache/arrow/memory/NettyAllocationManager.java
##
@@ -34,31 +33,34 @@
   static final UnsafeDirectLittleEndian EMPTY = INNER_ALLOCATOR.empty;
   static final long CHUNK_SIZE = INNER_ALLOCATOR.getChunkSize();
 
-  private final int allocatedSize;
+  private final long allocatedSize;
   private final UnsafeDirectLittleEndian memoryChunk;
+  private final long allocatedAddress;
 
-  NettyAllocationManager(BaseAllocator accountingAllocator, int requestedSize) 
{
+  NettyAllocationManager(BaseAllocator accountingAllocator, long 
requestedSize) {
 super(accountingAllocator);
-this.memoryChunk = INNER_ALLOCATOR.allocate(requestedSize);
-this.allocatedSize = memoryChunk.capacity();
-  }
-
-  /**
-   * Get the underlying memory chunk managed by this AllocationManager.
-   * @return buffer
-   */
-  UnsafeDirectLittleEndian getMemoryChunk() {

Review comment:
   Sounds good. Reverted this accordingly. 





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org