iajoiner commented on a change in pull request #9702:
URL: https://github.com/apache/arrow/pull/9702#discussion_r791340693



##########
File path: cpp/src/arrow/adapters/orc/options.h
##########
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/status.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace adapters {
+
+namespace orc {
+
+enum class WriterId : int32_t {
+  kOrcJava = 0,
+  kOrcCpp = 1,
+  kPresto = 2,
+  kScritchleyGo = 3,
+  kTrino = 4,
+  kUnknown = INT32_MAX
+};
+
+enum class WriterVersion : int32_t {
+  kOriginal = 0,
+  kHive8732 = 1,
+  kHive4243 = 2,
+  kHive12055 = 3,
+  kHive13083 = 4,
+  kOrc101 = 5,
+  kOrc135 = 6,
+  kOrc517 = 7,
+  kOrc203 = 8,
+  kOrc14 = 9,
+  kMax = INT32_MAX
+};
+
+enum class CompressionStrategy : int32_t { kSpeed = 0, kCompression };
+
+class ARROW_EXPORT FileVersion {
+ private:
+  int32_t major_version;
+  int32_t minor_version;
+
+ public:
+  static const FileVersion& v_0_11();
+  static const FileVersion& v_0_12();
+
+  FileVersion(int32_t major, int32_t minor)
+      : major_version(major), minor_version(minor) {}
+
+  /**
+   * Get major version
+   */
+  int32_t major() const { return this->major_version; }
+
+  /**
+   * Get minor version
+   */
+  int32_t minor() const { return this->minor_version; }
+
+  bool operator==(const FileVersion& right) const {
+    return this->major_version == right.major() && this->minor_version == right.minor();
+  }
+
+  bool operator!=(const FileVersion& right) const { return !(*this == right); }
+
+  std::string ToString() const;
+};
+
+/// Options for the ORC Writer
+struct ARROW_EXPORT WriteOptions {
+  /// Number of rows the ORC writer writes at a time, default 1024
+  int64_t batch_size = 1024;

Review comment:
       Thanks for pointing this out! 1024 is from the example here: 
https://orc.apache.org/docs/core-cpp.html Oddly in Java they have getMaxSize() 
because they use VectorizedRowBatch from Apache Hive itself (see 
https://github.com/apache/orc/blob/main/java/core/src/java/org/apache/orc/TypeDescription.java)

##########
File path: cpp/src/arrow/adapters/orc/options.h
##########
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/status.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace adapters {
+
+namespace orc {
+
+enum class WriterId : int32_t {
+  kOrcJava = 0,
+  kOrcCpp = 1,
+  kPresto = 2,
+  kScritchleyGo = 3,
+  kTrino = 4,
+  kUnknown = INT32_MAX
+};
+
+enum class WriterVersion : int32_t {
+  kOriginal = 0,
+  kHive8732 = 1,
+  kHive4243 = 2,
+  kHive12055 = 3,
+  kHive13083 = 4,
+  kOrc101 = 5,
+  kOrc135 = 6,
+  kOrc517 = 7,
+  kOrc203 = 8,
+  kOrc14 = 9,
+  kMax = INT32_MAX
+};
+
+enum class CompressionStrategy : int32_t { kSpeed = 0, kCompression };
+
+class ARROW_EXPORT FileVersion {
+ private:
+  int32_t major_version;
+  int32_t minor_version;
+
+ public:
+  static const FileVersion& v_0_11();
+  static const FileVersion& v_0_12();
+
+  FileVersion(int32_t major, int32_t minor)
+      : major_version(major), minor_version(minor) {}
+
+  /**
+   * Get major version
+   */
+  int32_t major() const { return this->major_version; }
+
+  /**
+   * Get minor version
+   */
+  int32_t minor() const { return this->minor_version; }
+
+  bool operator==(const FileVersion& right) const {
+    return this->major_version == right.major() && this->minor_version == right.minor();
+  }
+
+  bool operator!=(const FileVersion& right) const { return !(*this == right); }
+
+  std::string ToString() const;
+};
+
+/// Options for the ORC Writer
+struct ARROW_EXPORT WriteOptions {
+  /// Number of rows the ORC writer writes at a time, default 1024
+  int64_t batch_size = 1024;

Review comment:
       Thanks for pointing this out! 1024 is from the example here: 
https://orc.apache.org/docs/core-cpp.html Oddly in Java they have 
`getMaxSize()` because they use `VectorizedRowBatch` from Apache Hive itself 
(see 
https://github.com/apache/orc/blob/main/java/core/src/java/org/apache/orc/TypeDescription.java)

##########
File path: python/pyarrow/orc.py
##########
@@ -118,21 +186,93 @@ def read(self, columns=None):
         return self.reader.read(columns=columns)
 
 
-class ORCWriter:
-    """
-    Writer interface for a single ORC file
+_orc_writer_args_docs = """file_version : {"0.11", "0.12"}, default "0.12"
+    Determine which ORC file version to use.
+    `Hive 0.11 / ORC v0 <https://orc.apache.org/specification/ORCv0/>`_
+    is the older version
+    while `Hive 0.12 / ORC v1 <https://orc.apache.org/specification/ORCv1/>`_
+    is the newer one.
+batch_size : int, default 1024
+    Number of rows the ORC writer writes at a time.
+stripe_size : int, default 64 * 1024 * 1024
+    Size of each ORC stripe.

Review comment:
       Thanks a lot! Fixed in my latest PR. :)

##########
File path: cpp/src/arrow/adapters/orc/options.h
##########
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/status.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace adapters {
+
+namespace orc {
+
+enum class WriterId : int32_t {
+  kOrcJava = 0,
+  kOrcCpp = 1,
+  kPresto = 2,
+  kScritchleyGo = 3,
+  kTrino = 4,
+  kUnknown = INT32_MAX
+};
+
+enum class WriterVersion : int32_t {
+  kOriginal = 0,
+  kHive8732 = 1,
+  kHive4243 = 2,
+  kHive12055 = 3,
+  kHive13083 = 4,
+  kOrc101 = 5,
+  kOrc135 = 6,
+  kOrc517 = 7,
+  kOrc203 = 8,
+  kOrc14 = 9,
+  kMax = INT32_MAX
+};
+
+enum class CompressionStrategy : int32_t { kSpeed = 0, kCompression };
+
+class ARROW_EXPORT FileVersion {
+ private:
+  int32_t major_version;
+  int32_t minor_version;
+
+ public:
+  static const FileVersion& v_0_11();
+  static const FileVersion& v_0_12();
+
+  FileVersion(int32_t major, int32_t minor)
+      : major_version(major), minor_version(minor) {}
+
+  /**
+   * Get major version
+   */
+  int32_t major() const { return this->major_version; }
+
+  /**
+   * Get minor version
+   */
+  int32_t minor() const { return this->minor_version; }
+
+  bool operator==(const FileVersion& right) const {
+    return this->major_version == right.major() && this->minor_version == right.minor();
+  }
+
+  bool operator!=(const FileVersion& right) const { return !(*this == right); }
+
+  std::string ToString() const;
+};
+
+/// Options for the ORC Writer
+struct ARROW_EXPORT WriteOptions {
+  /// Number of rows the ORC writer writes at a time, default 1024
+  int64_t batch_size = 1024;

Review comment:
       Thanks for pointing this out! 1024 is from the example here: 
https://orc.apache.org/docs/core-cpp.html Oddly in Java they have 
`getMaxSize()` because they use `VectorizedRowBatch` from Apache Hive itself 
(see 
https://github.com/apache/orc/blob/main/java/core/src/java/org/apache/orc/TypeDescription.java)
   
   In Hive I can find the DEFAULT_SIZE here:
   
https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java#L62

##########
File path: cpp/src/arrow/adapters/orc/options.h
##########
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/status.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace adapters {
+
+namespace orc {
+
+enum class WriterId : int32_t {
+  kOrcJava = 0,
+  kOrcCpp = 1,
+  kPresto = 2,
+  kScritchleyGo = 3,
+  kTrino = 4,
+  kUnknown = INT32_MAX
+};
+
+enum class WriterVersion : int32_t {
+  kOriginal = 0,
+  kHive8732 = 1,
+  kHive4243 = 2,
+  kHive12055 = 3,
+  kHive13083 = 4,
+  kOrc101 = 5,
+  kOrc135 = 6,
+  kOrc517 = 7,
+  kOrc203 = 8,
+  kOrc14 = 9,
+  kMax = INT32_MAX
+};
+
+enum class CompressionStrategy : int32_t { kSpeed = 0, kCompression };
+
+class ARROW_EXPORT FileVersion {
+ private:
+  int32_t major_version;
+  int32_t minor_version;
+
+ public:
+  static const FileVersion& v_0_11();
+  static const FileVersion& v_0_12();
+
+  FileVersion(int32_t major, int32_t minor)
+      : major_version(major), minor_version(minor) {}
+
+  /**
+   * Get major version
+   */
+  int32_t major() const { return this->major_version; }
+
+  /**
+   * Get minor version
+   */
+  int32_t minor() const { return this->minor_version; }
+
+  bool operator==(const FileVersion& right) const {
+    return this->major_version == right.major() && this->minor_version == right.minor();
+  }
+
+  bool operator!=(const FileVersion& right) const { return !(*this == right); }
+
+  std::string ToString() const;
+};
+
+/// Options for the ORC Writer
+struct ARROW_EXPORT WriteOptions {
+  /// Number of rows the ORC writer writes at a time, default 1024
+  int64_t batch_size = 1024;

Review comment:
       Thanks for pointing this out! 1024 is from the example here: 
https://orc.apache.org/docs/core-cpp.html Oddly in Java they have 
`getMaxSize()` because they use `VectorizedRowBatch` from Apache Hive itself 
(see 
https://github.com/apache/orc/blob/main/java/core/src/java/org/apache/orc/TypeDescription.java)
   
   In Hive I can find the DEFAULT_SIZE here:
   
https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java#L62

##########
File path: cpp/src/arrow/adapters/orc/options.h
##########
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/status.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace adapters {
+
+namespace orc {
+
+enum class WriterId : int32_t {
+  kOrcJava = 0,
+  kOrcCpp = 1,
+  kPresto = 2,
+  kScritchleyGo = 3,
+  kTrino = 4,
+  kUnknown = INT32_MAX
+};
+
+enum class WriterVersion : int32_t {
+  kOriginal = 0,
+  kHive8732 = 1,
+  kHive4243 = 2,
+  kHive12055 = 3,
+  kHive13083 = 4,
+  kOrc101 = 5,
+  kOrc135 = 6,
+  kOrc517 = 7,
+  kOrc203 = 8,
+  kOrc14 = 9,
+  kMax = INT32_MAX
+};
+
+enum class CompressionStrategy : int32_t { kSpeed = 0, kCompression };
+
+class ARROW_EXPORT FileVersion {
+ private:
+  int32_t major_version;
+  int32_t minor_version;
+
+ public:
+  static const FileVersion& v_0_11();
+  static const FileVersion& v_0_12();
+
+  FileVersion(int32_t major, int32_t minor)
+      : major_version(major), minor_version(minor) {}
+
+  /**
+   * Get major version
+   */
+  int32_t major() const { return this->major_version; }
+
+  /**
+   * Get minor version
+   */
+  int32_t minor() const { return this->minor_version; }
+
+  bool operator==(const FileVersion& right) const {
+    return this->major_version == right.major() && this->minor_version == right.minor();
+  }
+
+  bool operator!=(const FileVersion& right) const { return !(*this == right); }
+
+  std::string ToString() const;
+};
+
+/// Options for the ORC Writer
+struct ARROW_EXPORT WriteOptions {
+  /// Number of rows the ORC writer writes at a time, default 1024
+  int64_t batch_size = 1024;

Review comment:
       Thanks for pointing this out! 1024 is from the example here: 
https://orc.apache.org/docs/core-cpp.html Oddly in Java they have 
`getMaxSize()` because they use `VectorizedRowBatch` from Apache Hive itself 
(see 
https://github.com/apache/orc/blob/main/java/core/src/java/org/apache/orc/TypeDescription.java)
   
   In Hive I can find the DEFAULT_SIZE here:
   
https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java#L62
   
   So the Hive community apparently believes that 1024 is a good `batch_size`.

##########
File path: cpp/src/arrow/adapters/orc/options.h
##########
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/status.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace adapters {
+
+namespace orc {
+
+enum class WriterId : int32_t {
+  kOrcJava = 0,
+  kOrcCpp = 1,
+  kPresto = 2,
+  kScritchleyGo = 3,
+  kTrino = 4,
+  kUnknown = INT32_MAX
+};
+
+enum class WriterVersion : int32_t {
+  kOriginal = 0,
+  kHive8732 = 1,
+  kHive4243 = 2,
+  kHive12055 = 3,
+  kHive13083 = 4,
+  kOrc101 = 5,
+  kOrc135 = 6,
+  kOrc517 = 7,
+  kOrc203 = 8,
+  kOrc14 = 9,
+  kMax = INT32_MAX
+};
+
+enum class CompressionStrategy : int32_t { kSpeed = 0, kCompression };
+
+class ARROW_EXPORT FileVersion {
+ private:
+  int32_t major_version;
+  int32_t minor_version;
+
+ public:
+  static const FileVersion& v_0_11();
+  static const FileVersion& v_0_12();
+
+  FileVersion(int32_t major, int32_t minor)
+      : major_version(major), minor_version(minor) {}
+
+  /**
+   * Get major version
+   */
+  int32_t major() const { return this->major_version; }
+
+  /**
+   * Get minor version
+   */
+  int32_t minor() const { return this->minor_version; }
+
+  bool operator==(const FileVersion& right) const {
+    return this->major_version == right.major() && this->minor_version == right.minor();
+  }
+
+  bool operator!=(const FileVersion& right) const { return !(*this == right); }
+
+  std::string ToString() const;
+};
+
+/// Options for the ORC Writer
+struct ARROW_EXPORT WriteOptions {
+  /// Number of rows the ORC writer writes at a time, default 1024
+  int64_t batch_size = 1024;

Review comment:
       Thanks for pointing this out! 1024 is from the example here: 
https://orc.apache.org/docs/core-cpp.html Oddly in Java they have 
`getMaxSize()` because they use `VectorizedRowBatch` from Apache Hive itself 
(see 
https://github.com/apache/orc/blob/main/java/core/src/java/org/apache/orc/TypeDescription.java#L21)
   
   In Hive I can find the DEFAULT_SIZE here:
   
https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java#L62
   
   So the Hive community apparently believes that 1024 is a good `batch_size`.

##########
File path: cpp/src/arrow/adapters/orc/options.h
##########
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/status.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace adapters {
+
+namespace orc {
+
+enum class WriterId : int32_t {
+  kOrcJava = 0,
+  kOrcCpp = 1,
+  kPresto = 2,
+  kScritchleyGo = 3,
+  kTrino = 4,
+  kUnknown = INT32_MAX
+};
+
+enum class WriterVersion : int32_t {
+  kOriginal = 0,
+  kHive8732 = 1,
+  kHive4243 = 2,
+  kHive12055 = 3,
+  kHive13083 = 4,
+  kOrc101 = 5,
+  kOrc135 = 6,
+  kOrc517 = 7,
+  kOrc203 = 8,
+  kOrc14 = 9,
+  kMax = INT32_MAX
+};
+
+enum class CompressionStrategy : int32_t { kSpeed = 0, kCompression };
+
+class ARROW_EXPORT FileVersion {
+ private:
+  int32_t major_version;
+  int32_t minor_version;
+
+ public:
+  static const FileVersion& v_0_11();
+  static const FileVersion& v_0_12();
+
+  FileVersion(int32_t major, int32_t minor)
+      : major_version(major), minor_version(minor) {}
+
+  /**
+   * Get major version
+   */
+  int32_t major() const { return this->major_version; }
+
+  /**
+   * Get minor version
+   */
+  int32_t minor() const { return this->minor_version; }
+
+  bool operator==(const FileVersion& right) const {
+    return this->major_version == right.major() && this->minor_version == right.minor();
+  }
+
+  bool operator!=(const FileVersion& right) const { return !(*this == right); }
+
+  std::string ToString() const;
+};
+
+/// Options for the ORC Writer
+struct ARROW_EXPORT WriteOptions {
+  /// Number of rows the ORC writer writes at a time, default 1024
+  int64_t batch_size = 1024;

Review comment:
       Thanks for pointing this out! 1024 is from the example here: 
https://orc.apache.org/docs/core-cpp.html Oddly in Java they have 
`getMaxSize()` because they use `VectorizedRowBatch` from Apache Hive itself 
(see 
https://github.com/apache/orc/blob/main/java/core/src/java/org/apache/orc/TypeDescription.java#L21)
   
   In Hive I can find the DEFAULT_SIZE here:
   
https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java#L62
   
   So the Hive community apparently believes that 1024 is a good `batch_size`. 
Not sure whether we want to follow them exactly.

##########
File path: cpp/src/arrow/adapters/orc/options.h
##########
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/status.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace adapters {
+
+namespace orc {
+
+enum class WriterId : int32_t {
+  kOrcJava = 0,
+  kOrcCpp = 1,
+  kPresto = 2,
+  kScritchleyGo = 3,
+  kTrino = 4,
+  kUnknown = INT32_MAX
+};
+
+enum class WriterVersion : int32_t {
+  kOriginal = 0,
+  kHive8732 = 1,
+  kHive4243 = 2,
+  kHive12055 = 3,
+  kHive13083 = 4,
+  kOrc101 = 5,
+  kOrc135 = 6,
+  kOrc517 = 7,
+  kOrc203 = 8,
+  kOrc14 = 9,
+  kMax = INT32_MAX
+};
+
+enum class CompressionStrategy : int32_t { kSpeed = 0, kCompression };
+
+class ARROW_EXPORT FileVersion {
+ private:
+  int32_t major_version;
+  int32_t minor_version;
+
+ public:
+  static const FileVersion& v_0_11();
+  static const FileVersion& v_0_12();
+
+  FileVersion(int32_t major, int32_t minor)
+      : major_version(major), minor_version(minor) {}
+
+  /**
+   * Get major version
+   */
+  int32_t major() const { return this->major_version; }
+
+  /**
+   * Get minor version
+   */
+  int32_t minor() const { return this->minor_version; }
+
+  bool operator==(const FileVersion& right) const {
+    return this->major_version == right.major() && this->minor_version == right.minor();
+  }
+
+  bool operator!=(const FileVersion& right) const { return !(*this == right); }
+
+  std::string ToString() const;
+};
+
+/// Options for the ORC Writer
+struct ARROW_EXPORT WriteOptions {
+  /// Number of rows the ORC writer writes at a time, default 1024
+  int64_t batch_size = 1024;

Review comment:
       Thanks for pointing this out! 1024 is from the example here: 
https://orc.apache.org/docs/core-cpp.html Oddly in Java they have 
`getMaxSize()` because they use `VectorizedRowBatch` from Apache Hive itself 
(see 
https://github.com/apache/orc/blob/main/java/core/src/java/org/apache/orc/TypeDescription.java#L21)
   
   In Hive I can find the DEFAULT_SIZE here:
   
https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java#L62
   
   So the Hive community apparently believes that 1024 is a good `batch_size`. 
Not sure whether we exactly want to follow them since we aren’t integrated with 
Hive.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to