This is an automated email from the ASF dual-hosted git repository.
philo-he pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 6e04b4a4bb [MINOR][VL] Remove modify_arrow_dataset_scan_option.patch
(#12148)
6e04b4a4bb is described below
commit 6e04b4a4bb25197d0afc4b3f9e7c7803ba9a566e
Author: Kent Yao <[email protected]>
AuthorDate: Tue Jun 9 05:10:08 2026 +0800
[MINOR][VL] Remove modify_arrow_dataset_scan_option.patch (#12148)
---
dev/build-arrow.sh | 1 -
ep/build-velox/src/get-velox.sh | 2 -
.../src/modify_arrow_dataset_scan_option.patch | 883 ---------------------
3 files changed, 886 deletions(-)
diff --git a/dev/build-arrow.sh b/dev/build-arrow.sh
index 54c6faaf33..1bf4ef9d08 100755
--- a/dev/build-arrow.sh
+++ b/dev/build-arrow.sh
@@ -31,7 +31,6 @@ function prepare_arrow_build() {
#wget_and_untar
https://archive.apache.org/dist/arrow/arrow-${VELOX_ARROW_BUILD_VERSION}/apache-arrow-${VELOX_ARROW_BUILD_VERSION}.tar.gz
arrow_ep
cd arrow_ep
patch -p1 < $CURRENT_DIR/../ep/build-velox/src/modify_arrow.patch
- patch -p1 <
$CURRENT_DIR/../ep/build-velox/src/modify_arrow_dataset_scan_option.patch
patch -p1 < $CURRENT_DIR/../ep/build-velox/src/cmake-compatibility.patch
patch -p1 < $CURRENT_DIR/../ep/build-velox/src/support_ibm_power.patch
popd
diff --git a/ep/build-velox/src/get-velox.sh b/ep/build-velox/src/get-velox.sh
index 240be716ae..4755a2a5dc 100755
--- a/ep/build-velox/src/get-velox.sh
+++ b/ep/build-velox/src/get-velox.sh
@@ -146,10 +146,8 @@ function apply_compilation_fixes {
SUDO_CMD="sudo"
fi
$SUDO_CMD cp ${CURRENT_DIR}/modify_arrow.patch
${VELOX_HOME}/CMake/resolve_dependency_modules/arrow/
- $SUDO_CMD cp ${CURRENT_DIR}/modify_arrow_dataset_scan_option.patch
${VELOX_HOME}/CMake/resolve_dependency_modules/arrow/
git add
${VELOX_HOME}/CMake/resolve_dependency_modules/arrow/modify_arrow.patch # to
avoid the file from being deleted by git clean -dffx :/
- git add
${VELOX_HOME}/CMake/resolve_dependency_modules/arrow/modify_arrow_dataset_scan_option.patch
# to avoid the file from being deleted by git clean -dffx :/
}
function setup_linux {
diff --git a/ep/build-velox/src/modify_arrow_dataset_scan_option.patch
b/ep/build-velox/src/modify_arrow_dataset_scan_option.patch
deleted file mode 100644
index 4af78c030c..0000000000
--- a/ep/build-velox/src/modify_arrow_dataset_scan_option.patch
+++ /dev/null
@@ -1,883 +0,0 @@
-diff --git a/cpp/src/arrow/dataset/file_csv.cc
b/cpp/src/arrow/dataset/file_csv.cc
-index 09ab77572..f09377cf9 100644
---- a/cpp/src/arrow/dataset/file_csv.cc
-+++ b/cpp/src/arrow/dataset/file_csv.cc
-@@ -24,6 +24,7 @@
- #include <unordered_set>
- #include <utility>
-
-+#include "arrow/c/bridge.h"
- #include "arrow/csv/options.h"
- #include "arrow/csv/parser.h"
- #include "arrow/csv/reader.h"
-@@ -52,6 +53,9 @@ using internal::Executor;
- using internal::SerialExecutor;
-
- namespace dataset {
-+namespace {
-+inline bool parseBool(const std::string& value) { return value == "true" ?
true : false; }
-+} // namespace
-
- struct CsvInspectedFragment : public InspectedFragment {
- CsvInspectedFragment(std::vector<std::string> column_names,
-@@ -503,5 +507,33 @@ Future<> CsvFileWriter::FinishInternal() {
- return Status::OK();
- }
-
-+Result<std::shared_ptr<FragmentScanOptions>> CsvFragmentScanOptions::from(
-+ const std::unordered_map<std::string, std::string>& configs) {
-+ std::shared_ptr<CsvFragmentScanOptions> options =
-+ std::make_shared<CsvFragmentScanOptions>();
-+ for (auto const& it : configs) {
-+ auto& key = it.first;
-+ auto& value = it.second;
-+ if (key == "delimiter") {
-+ options->parse_options.delimiter = value.data()[0];
-+ } else if (key == "quoting") {
-+ options->parse_options.quoting = parseBool(value);
-+ } else if (key == "column_types") {
-+ int64_t schema_address = std::stol(value);
-+ ArrowSchema* cSchema = reinterpret_cast<ArrowSchema*>(schema_address);
-+ ARROW_ASSIGN_OR_RAISE(auto schema, arrow::ImportSchema(cSchema));
-+ auto& column_types = options->convert_options.column_types;
-+ for (auto field : schema->fields()) {
-+ column_types[field->name()] = field->type();
-+ }
-+ } else if (key == "strings_can_be_null") {
-+ options->convert_options.strings_can_be_null = parseBool(value);
-+ } else {
-+ return Status::Invalid("Config " + it.first + "is not supported.");
-+ }
-+ }
-+ return options;
-+}
-+
- } // namespace dataset
- } // namespace arrow
-diff --git a/cpp/src/arrow/dataset/file_csv.h
b/cpp/src/arrow/dataset/file_csv.h
-index 42e3fd724..4d2825183 100644
---- a/cpp/src/arrow/dataset/file_csv.h
-+++ b/cpp/src/arrow/dataset/file_csv.h
-@@ -85,6 +85,9 @@ class ARROW_DS_EXPORT CsvFileFormat : public FileFormat {
- struct ARROW_DS_EXPORT CsvFragmentScanOptions : public FragmentScanOptions {
- std::string type_name() const override { return kCsvTypeName; }
-
-+ static Result<std::shared_ptr<FragmentScanOptions>> from(
-+ const std::unordered_map<std::string, std::string>& configs);
-+
- using StreamWrapFunc =
std::function<Result<std::shared_ptr<io::InputStream>>(
- std::shared_ptr<io::InputStream>)>;
-
-diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc
b/cpp/src/arrow/engine/substrait/expression_internal.cc
-index 5d892af9a..0f8b0448b 100644
---- a/cpp/src/arrow/engine/substrait/expression_internal.cc
-+++ b/cpp/src/arrow/engine/substrait/expression_internal.cc
-@@ -1337,5 +1337,17 @@ Result<std::unique_ptr<substrait::Expression>> ToProto(
- return std::move(out);
- }
-
-+Status FromProto(const substrait::Expression::Literal& literal,
-+ std::unordered_map<std::string, std::string>& out) {
-+ ARROW_RETURN_IF(!literal.has_map(), Status::Invalid("Literal does not have
a map."));
-+ auto literalMap = literal.map();
-+ auto size = literalMap.key_values_size();
-+ for (auto i = 0; i < size; i++) {
-+ substrait::Expression_Literal_Map_KeyValue keyValue =
literalMap.key_values(i);
-+ out.emplace(keyValue.key().string(), keyValue.value().string());
-+ }
-+ return Status::OK();
-+}
-+
- } // namespace engine
- } // namespace arrow
-diff --git a/cpp/src/arrow/engine/substrait/expression_internal.h
b/cpp/src/arrow/engine/substrait/expression_internal.h
-index 2ce2ee76a..9be81b7ab 100644
---- a/cpp/src/arrow/engine/substrait/expression_internal.h
-+++ b/cpp/src/arrow/engine/substrait/expression_internal.h
-@@ -61,5 +61,9 @@ ARROW_ENGINE_EXPORT
- Result<SubstraitCall> FromProto(const substrait::AggregateFunction&, bool
is_hash,
- const ExtensionSet&, const
ConversionOptions&);
-
-+ARROW_ENGINE_EXPORT
-+Status FromProto(const substrait::Expression::Literal& literal,
-+ std::unordered_map<std::string, std::string>& out);
-+
- } // namespace engine
- } // namespace arrow
-diff --git a/cpp/src/arrow/engine/substrait/serde.cc
b/cpp/src/arrow/engine/substrait/serde.cc
-index 9e670f121..02e5c7171 100644
---- a/cpp/src/arrow/engine/substrait/serde.cc
-+++ b/cpp/src/arrow/engine/substrait/serde.cc
-@@ -247,6 +247,16 @@ Result<BoundExpressions> DeserializeExpressions(
- return FromProto(extended_expression, ext_set_out, conversion_options,
registry);
- }
-
-+Status DeserializeMap(const Buffer& buf,
-+ std::unordered_map<std::string, std::string>& out) {
-+ // ARROW_ASSIGN_OR_RAISE(auto advanced_extension,
-+ //
ParseFromBuffer<substrait::extensions::AdvancedExtension>(buf));
-+ // return FromProto(advanced_extension, out);
-+ ARROW_ASSIGN_OR_RAISE(auto literal,
-+ ParseFromBuffer<substrait::Expression::Literal>(buf));
-+ return FromProto(literal, out);
-+}
-+
- namespace {
-
- Result<std::shared_ptr<acero::ExecPlan>> MakeSingleDeclarationPlan(
-diff --git a/cpp/src/arrow/engine/substrait/serde.h
b/cpp/src/arrow/engine/substrait/serde.h
-index ab749f4a6..6312ec239 100644
---- a/cpp/src/arrow/engine/substrait/serde.h
-+++ b/cpp/src/arrow/engine/substrait/serde.h
-@@ -23,6 +23,7 @@
- #include <memory>
- #include <string>
- #include <string_view>
-+#include <unordered_map>
- #include <vector>
-
- #include "arrow/compute/type_fwd.h"
-@@ -183,6 +184,9 @@ ARROW_ENGINE_EXPORT Result<BoundExpressions>
DeserializeExpressions(
- const ConversionOptions& conversion_options = {},
- ExtensionSet* ext_set_out = NULLPTR);
-
-+ARROW_ENGINE_EXPORT Status
-+DeserializeMap(const Buffer& buf, std::unordered_map<std::string,
std::string>& out);
-+
- /// \brief Deserializes a Substrait Type message to the corresponding Arrow
type
- ///
- /// \param[in] buf a buffer containing the protobuf serialization of a
Substrait Type
-diff --git a/java/dataset/pom.xml b/java/dataset/pom.xml
-index d4d3e2c0f..ce72eaa1f 100644
---- a/java/dataset/pom.xml
-+++ b/java/dataset/pom.xml
-@@ -25,9 +25,10 @@
- <packaging>jar</packaging>
- <properties>
- <arrow.cpp.build.dir>../../../cpp/release-build/</arrow.cpp.build.dir>
-- <protobuf.version>2.5.0</protobuf.version>
- <parquet.version>1.11.0</parquet.version>
- <avro.version>1.11.3</avro.version>
-+ <substrait.version>0.31.0</substrait.version>
-+ <protobuf.version>3.25.3</protobuf.version>
- </properties>
-
- <dependencies>
-@@ -47,6 +48,18 @@
- <artifactId>arrow-c-data</artifactId>
- <scope>compile</scope>
- </dependency>
-+ <dependency>
-+ <groupId>io.substrait</groupId>
-+ <artifactId>core</artifactId>
-+ <version>${substrait.version}</version>
-+ <scope>provided</scope>
-+ </dependency>
-+ <dependency>
-+ <groupId>com.google.protobuf</groupId>
-+ <artifactId>protobuf-java</artifactId>
-+ <version>${protobuf.version}</version>
-+ <scope>provided</scope>
-+ </dependency>
- <dependency>
- <groupId>org.apache.arrow</groupId>
- <artifactId>arrow-memory-netty</artifactId>
-diff --git a/java/dataset/src/main/cpp/jni_wrapper.cc
b/java/dataset/src/main/cpp/jni_wrapper.cc
-index 8d7dafd84..89cdc39fe 100644
---- a/java/dataset/src/main/cpp/jni_wrapper.cc
-+++ b/java/dataset/src/main/cpp/jni_wrapper.cc
-@@ -25,6 +25,7 @@
- #include "arrow/c/helpers.h"
- #include "arrow/dataset/api.h"
- #include "arrow/dataset/file_base.h"
-+#include "arrow/dataset/file_csv.h"
- #include "arrow/filesystem/localfs.h"
- #include "arrow/filesystem/path_util.h"
- #ifdef ARROW_S3
-@@ -122,6 +123,19 @@
arrow::Result<std::shared_ptr<arrow::dataset::FileFormat>> GetFileFormat(
- }
- }
-
-+arrow::Result<std::shared_ptr<arrow::dataset::FragmentScanOptions>>
-+GetFragmentScanOptions(jint file_format_id,
-+ const std::unordered_map<std::string, std::string>&
configs) {
-+ switch (file_format_id) {
-+#ifdef ARROW_CSV
-+ case 3:
-+ return arrow::dataset::CsvFragmentScanOptions::from(configs);
-+#endif
-+ default:
-+ return arrow::Status::Invalid("Illegal file format id: "
,file_format_id);
-+ }
-+}
-+
- class ReserveFromJava : public arrow::dataset::jni::ReservationListener {
- public:
- ReserveFromJava(JavaVM* vm, jobject java_reservation_listener)
-@@ -460,12 +474,13 @@ JNIEXPORT void JNICALL
Java_org_apache_arrow_dataset_jni_JniWrapper_closeDataset
- /*
- * Class: org_apache_arrow_dataset_jni_JniWrapper
- * Method: createScanner
-- * Signature:
(J[Ljava/lang/String;Ljava/nio/ByteBuffer;Ljava/nio/ByteBuffer;JJ)J
-+ * Signature:
-+ *
(J[Ljava/lang/String;Ljava/nio/ByteBuffer;Ljava/nio/ByteBuffer;JJ;Ljava/nio/ByteBuffer;J)J
- */
- JNIEXPORT jlong JNICALL
Java_org_apache_arrow_dataset_jni_JniWrapper_createScanner(
- JNIEnv* env, jobject, jlong dataset_id, jobjectArray columns,
-- jobject substrait_projection, jobject substrait_filter,
-- jlong batch_size, jlong memory_pool_id) {
-+ jobject substrait_projection, jobject substrait_filter, jlong batch_size,
-+ jlong file_format_id, jobject options, jlong memory_pool_id) {
- JNI_METHOD_START
- arrow::MemoryPool* pool =
reinterpret_cast<arrow::MemoryPool*>(memory_pool_id);
- if (pool == nullptr) {
-@@ -514,6 +529,14 @@ JNIEXPORT jlong JNICALL
Java_org_apache_arrow_dataset_jni_JniWrapper_createScann
- }
- JniAssertOkOrThrow(scanner_builder->Filter(*filter_expr));
- }
-+ if (file_format_id != -1 && options != nullptr) {
-+ std::unordered_map<std::string, std::string> option_map;
-+ std::shared_ptr<arrow::Buffer> buffer =
LoadArrowBufferFromByteBuffer(env, options);
-+ JniAssertOkOrThrow(arrow::engine::DeserializeMap(*buffer, option_map));
-+ std::shared_ptr<arrow::dataset::FragmentScanOptions> scan_options =
-+ JniGetOrThrow(GetFragmentScanOptions(file_format_id, option_map));
-+ JniAssertOkOrThrow(scanner_builder->FragmentScanOptions(scan_options));
-+ }
- JniAssertOkOrThrow(scanner_builder->BatchSize(batch_size));
-
- auto scanner = JniGetOrThrow(scanner_builder->Finish());
-@@ -627,14 +650,31 @@ JNIEXPORT void JNICALL
Java_org_apache_arrow_dataset_jni_JniWrapper_ensureS3Fina
- /*
- * Class: org_apache_arrow_dataset_file_JniWrapper
- * Method: makeFileSystemDatasetFactory
-- * Signature: (Ljava/lang/String;II)J
-+ * Signature: (Ljava/lang/String;IILjava/lang/String;Ljava/nio/ByteBuffer)J
- */
- JNIEXPORT jlong JNICALL
--Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactory__Ljava_lang_String_2I(
-- JNIEnv* env, jobject, jstring uri, jint file_format_id) {
-+Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactory(
-+ JNIEnv* env, jobject, jstring uri, jint file_format_id, jobject options) {
- JNI_METHOD_START
- std::shared_ptr<arrow::dataset::FileFormat> file_format =
- JniGetOrThrow(GetFileFormat(file_format_id));
-+ if (options != nullptr) {
-+ std::unordered_map<std::string, std::string> option_map;
-+ std::shared_ptr<arrow::Buffer> buffer =
LoadArrowBufferFromByteBuffer(env, options);
-+ JniAssertOkOrThrow(arrow::engine::DeserializeMap(*buffer, option_map));
-+ std::shared_ptr<arrow::dataset::FragmentScanOptions> scan_options =
-+ JniGetOrThrow(GetFragmentScanOptions(file_format_id, option_map));
-+ file_format->default_fragment_scan_options = scan_options;
-+#ifdef ARROW_CSV
-+ if (file_format_id == 3) {
-+ std::shared_ptr<arrow::dataset::CsvFileFormat> csv_file_format =
-+
std::dynamic_pointer_cast<arrow::dataset::CsvFileFormat>(file_format);
-+ csv_file_format->parse_options =
-+
std::dynamic_pointer_cast<arrow::dataset::CsvFragmentScanOptions>(scan_options)
-+ ->parse_options;
-+ }
-+#endif
-+ }
- arrow::dataset::FileSystemFactoryOptions options;
- std::shared_ptr<arrow::dataset::DatasetFactory> d =
- JniGetOrThrow(arrow::dataset::FileSystemDatasetFactory::Make(
-@@ -645,16 +685,33 @@
Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactory__Ljav
-
- /*
- * Class: org_apache_arrow_dataset_file_JniWrapper
-- * Method: makeFileSystemDatasetFactory
-- * Signature: ([Ljava/lang/String;II)J
-+ * Method: makeFileSystemDatasetFactoryWithFiles
-+ * Signature: ([Ljava/lang/String;IIJ;Ljava/nio/ByteBuffer)J
- */
- JNIEXPORT jlong JNICALL
--Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactory___3Ljava_lang_String_2I(
-- JNIEnv* env, jobject, jobjectArray uris, jint file_format_id) {
-+Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactoryWithFiles(
-+ JNIEnv* env, jobject, jobjectArray uris, jint file_format_id, jobject
options) {
- JNI_METHOD_START
-
- std::shared_ptr<arrow::dataset::FileFormat> file_format =
- JniGetOrThrow(GetFileFormat(file_format_id));
-+ if (options != nullptr) {
-+ std::unordered_map<std::string, std::string> option_map;
-+ std::shared_ptr<arrow::Buffer> buffer =
LoadArrowBufferFromByteBuffer(env, options);
-+ JniAssertOkOrThrow(arrow::engine::DeserializeMap(*buffer, option_map));
-+ std::shared_ptr<arrow::dataset::FragmentScanOptions> scan_options =
-+ JniGetOrThrow(GetFragmentScanOptions(file_format_id, option_map));
-+ file_format->default_fragment_scan_options = scan_options;
-+#ifdef ARROW_CSV
-+ if (file_format_id == 3) {
-+ std::shared_ptr<arrow::dataset::CsvFileFormat> csv_file_format =
-+
std::dynamic_pointer_cast<arrow::dataset::CsvFileFormat>(file_format);
-+ csv_file_format->parse_options =
-+
std::dynamic_pointer_cast<arrow::dataset::CsvFragmentScanOptions>(scan_options)
-+ ->parse_options;
-+ }
-+#endif
-+ }
- arrow::dataset::FileSystemFactoryOptions options;
-
- std::vector<std::string> uri_vec = ToStringVector(env, uris);
-diff --git
a/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileSystemDatasetFactory.java
b/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileSystemDatasetFactory.java
-index aa3156905..a0b6fb168 100644
----
a/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileSystemDatasetFactory.java
-+++
b/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileSystemDatasetFactory.java
-@@ -17,8 +17,11 @@
-
- package org.apache.arrow.dataset.file;
-
-+import java.util.Optional;
-+
- import org.apache.arrow.dataset.jni.NativeDatasetFactory;
- import org.apache.arrow.dataset.jni.NativeMemoryPool;
-+import org.apache.arrow.dataset.scanner.FragmentScanOptions;
- import org.apache.arrow.memory.BufferAllocator;
-
- /**
-@@ -27,21 +30,34 @@ import org.apache.arrow.memory.BufferAllocator;
- public class FileSystemDatasetFactory extends NativeDatasetFactory {
-
- public FileSystemDatasetFactory(BufferAllocator allocator, NativeMemoryPool
memoryPool, FileFormat format,
-- String uri) {
-- super(allocator, memoryPool, createNative(format, uri));
-+ String uri, Optional<FragmentScanOptions> fragmentScanOptions) {
-+ super(allocator, memoryPool, createNative(format, uri,
fragmentScanOptions));
-+ }
-+
-+ public FileSystemDatasetFactory(BufferAllocator allocator, NativeMemoryPool
memoryPool, FileFormat format,
-+ String uri) {
-+ super(allocator, memoryPool, createNative(format, uri, Optional.empty()));
-+ }
-+
-+ public FileSystemDatasetFactory(BufferAllocator allocator, NativeMemoryPool
memoryPool, FileFormat format,
-+ String[] uris,
Optional<FragmentScanOptions> fragmentScanOptions) {
-+ super(allocator, memoryPool, createNative(format, uris,
fragmentScanOptions));
- }
-
- public FileSystemDatasetFactory(BufferAllocator allocator, NativeMemoryPool
memoryPool, FileFormat format,
- String[] uris) {
-- super(allocator, memoryPool, createNative(format, uris));
-+ super(allocator, memoryPool, createNative(format, uris,
Optional.empty()));
- }
-
-- private static long createNative(FileFormat format, String uri) {
-- return JniWrapper.get().makeFileSystemDatasetFactory(uri, format.id());
-+ private static long createNative(FileFormat format, String uri,
Optional<FragmentScanOptions> fragmentScanOptions) {
-+ return JniWrapper.get().makeFileSystemDatasetFactory(uri, format.id(),
-+ fragmentScanOptions.map(FragmentScanOptions::serialize).orElse(null));
- }
-
-- private static long createNative(FileFormat format, String[] uris) {
-- return JniWrapper.get().makeFileSystemDatasetFactory(uris, format.id());
-+ private static long createNative(FileFormat format, String[] uris,
-+ Optional<FragmentScanOptions>
fragmentScanOptions) {
-+ return JniWrapper.get().makeFileSystemDatasetFactoryWithFiles(uris,
format.id(),
-+ fragmentScanOptions.map(FragmentScanOptions::serialize).orElse(null));
- }
-
- }
-diff --git
a/java/dataset/src/main/java/org/apache/arrow/dataset/file/JniWrapper.java
b/java/dataset/src/main/java/org/apache/arrow/dataset/file/JniWrapper.java
-index c3a1a4e58..c3f8e12b3 100644
---- a/java/dataset/src/main/java/org/apache/arrow/dataset/file/JniWrapper.java
-+++ b/java/dataset/src/main/java/org/apache/arrow/dataset/file/JniWrapper.java
-@@ -17,6 +17,8 @@
-
- package org.apache.arrow.dataset.file;
-
-+import java.nio.ByteBuffer;
-+
- import org.apache.arrow.dataset.jni.JniLoader;
-
- /**
-@@ -43,7 +45,8 @@ public class JniWrapper {
- * @return the native pointer of the
arrow::dataset::FileSystemDatasetFactory instance.
- * @see FileFormat
- */
-- public native long makeFileSystemDatasetFactory(String uri, int fileFormat);
-+ public native long makeFileSystemDatasetFactory(String uri, int fileFormat,
-+ ByteBuffer
serializedFragmentScanOptions);
-
- /**
- * Create FileSystemDatasetFactory and return its native pointer. The
pointer is pointing to a
-@@ -54,7 +57,8 @@ public class JniWrapper {
- * @return the native pointer of the
arrow::dataset::FileSystemDatasetFactory instance.
- * @see FileFormat
- */
-- public native long makeFileSystemDatasetFactory(String[] uris, int
fileFormat);
-+ public native long makeFileSystemDatasetFactoryWithFiles(String[] uris, int
fileFormat,
-+ ByteBuffer
serializedFragmentScanOptions);
-
- /**
- * Write the content in a {@link org.apache.arrow.c.ArrowArrayStream} into
files. This internally
-diff --git
a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java
b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java
-index 637a3e8f2..6d6309140 100644
---- a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java
-+++ b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java
-@@ -80,7 +80,8 @@ public class JniWrapper {
- * @return the native pointer of the arrow::dataset::Scanner instance.
- */
- public native long createScanner(long datasetId, String[] columns,
ByteBuffer substraitProjection,
-- ByteBuffer substraitFilter, long
batchSize, long memoryPool);
-+ ByteBuffer substraitFilter, long
batchSize, long fileFormat,
-+ ByteBuffer serializedFragmentScanOptions,
long memoryPool);
-
- /**
- * Get a serialized schema from native instance of a Scanner.
-diff --git
a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/NativeDataset.java
b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/NativeDataset.java
-index d9abad997..3a96fe768 100644
----
a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/NativeDataset.java
-+++
b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/NativeDataset.java
-@@ -17,6 +17,9 @@
-
- package org.apache.arrow.dataset.jni;
-
-+import java.nio.ByteBuffer;
-+
-+import org.apache.arrow.dataset.scanner.FragmentScanOptions;
- import org.apache.arrow.dataset.scanner.ScanOptions;
- import org.apache.arrow.dataset.source.Dataset;
-
-@@ -40,11 +43,18 @@ public class NativeDataset implements Dataset {
- if (closed) {
- throw new NativeInstanceReleasedException();
- }
--
-+ int fileFormat = -1;
-+ ByteBuffer serialized = null;
-+ if (options.getFragmentScanOptions().isPresent()) {
-+ FragmentScanOptions fragmentScanOptions =
options.getFragmentScanOptions().get();
-+ fileFormat = fragmentScanOptions.fileFormatId();
-+ serialized = fragmentScanOptions.serialize();
-+ }
- long scannerId = JniWrapper.get().createScanner(datasetId,
options.getColumns().orElse(null),
- options.getSubstraitProjection().orElse(null),
- options.getSubstraitFilter().orElse(null),
-- options.getBatchSize(),
context.getMemoryPool().getNativeInstanceId());
-+ options.getBatchSize(), fileFormat, serialized,
-+ context.getMemoryPool().getNativeInstanceId());
-
- return new NativeScanner(context, scannerId);
- }
-diff --git
a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/FragmentScanOptions.java
b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/FragmentScanOptions.java
-new file mode 100644
-index 000000000..8acb2b2d4
---- /dev/null
-+++
b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/FragmentScanOptions.java
-@@ -0,0 +1,50 @@
-+/*
-+ * Licensed to the Apache Software Foundation (ASF) under one or more
-+ * contributor license agreements. See the NOTICE file distributed with
-+ * this work for additional information regarding copyright ownership.
-+ * The ASF licenses this file to You under the Apache License, Version 2.0
-+ * (the "License"); you may not use this file except in compliance with
-+ * the License. You may obtain a copy of the License at
-+ *
-+ * http://www.apache.org/licenses/LICENSE-2.0
-+ *
-+ * Unless required by applicable law or agreed to in writing, software
-+ * distributed under the License is distributed on an "AS IS" BASIS,
-+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+ * See the License for the specific language governing permissions and
-+ * limitations under the License.
-+ */
-+
-+package org.apache.arrow.dataset.scanner;
-+
-+import java.nio.ByteBuffer;
-+import java.util.Map;
-+
-+import org.apache.arrow.dataset.substrait.util.ConvertUtil;
-+
-+import io.substrait.proto.Expression;
-+
-+public interface FragmentScanOptions {
-+ String typeName();
-+
-+ int fileFormatId();
-+
-+ ByteBuffer serialize();
-+
-+ /**
-+ * serialize the map.
-+ *
-+ * @param config config map
-+ * @return bufer to jni call argument, should be DirectByteBuffer
-+ */
-+ default ByteBuffer serializeMap(Map<String, String> config) {
-+ if (config.isEmpty()) {
-+ return null;
-+ }
-+
-+ Expression.Literal literal = ConvertUtil.mapToExpressionLiteral(config);
-+ ByteBuffer buf = ByteBuffer.allocateDirect(literal.getSerializedSize());
-+ buf.put(literal.toByteArray());
-+ return buf;
-+ }
-+}
-diff --git
a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/ScanOptions.java
b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/ScanOptions.java
-index 995d05ac3..aad71930c 100644
----
a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/ScanOptions.java
-+++
b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/ScanOptions.java
-@@ -31,6 +31,8 @@ public class ScanOptions {
- private final Optional<ByteBuffer> substraitProjection;
- private final Optional<ByteBuffer> substraitFilter;
-
-+ private final Optional<FragmentScanOptions> fragmentScanOptions;
-+
- /**
- * Constructor.
- * @param columns Projected columns. Empty for scanning all columns.
-@@ -61,6 +63,7 @@ public class ScanOptions {
- this.columns = columns;
- this.substraitProjection = Optional.empty();
- this.substraitFilter = Optional.empty();
-+ this.fragmentScanOptions = Optional.empty();
- }
-
- public ScanOptions(long batchSize) {
-@@ -83,6 +86,10 @@ public class ScanOptions {
- return substraitFilter;
- }
-
-+ public Optional<FragmentScanOptions> getFragmentScanOptions() {
-+ return fragmentScanOptions;
-+ }
-+
- /**
- * Builder for Options used during scanning.
- */
-@@ -91,6 +98,7 @@ public class ScanOptions {
- private Optional<String[]> columns;
- private ByteBuffer substraitProjection;
- private ByteBuffer substraitFilter;
-+ private FragmentScanOptions fragmentScanOptions;
-
- /**
- * Constructor.
-@@ -136,6 +144,18 @@ public class ScanOptions {
- return this;
- }
-
-+ /**
-+ * Set the FragmentScanOptions.
-+ *
-+ * @param fragmentScanOptions scan options
-+ * @return the ScanOptions configured.
-+ */
-+ public Builder fragmentScanOptions(FragmentScanOptions
fragmentScanOptions) {
-+ Preconditions.checkNotNull(fragmentScanOptions);
-+ this.fragmentScanOptions = fragmentScanOptions;
-+ return this;
-+ }
-+
- public ScanOptions build() {
- return new ScanOptions(this);
- }
-@@ -146,5 +166,6 @@ public class ScanOptions {
- columns = builder.columns;
- substraitProjection = Optional.ofNullable(builder.substraitProjection);
- substraitFilter = Optional.ofNullable(builder.substraitFilter);
-+ fragmentScanOptions = Optional.ofNullable(builder.fragmentScanOptions);
- }
- }
-diff --git
a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvConvertOptions.java
b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvConvertOptions.java
-new file mode 100644
-index 000000000..08e35ede2
---- /dev/null
-+++
b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvConvertOptions.java
-@@ -0,0 +1,51 @@
-+/*
-+ * Licensed to the Apache Software Foundation (ASF) under one or more
-+ * contributor license agreements. See the NOTICE file distributed with
-+ * this work for additional information regarding copyright ownership.
-+ * The ASF licenses this file to You under the Apache License, Version 2.0
-+ * (the "License"); you may not use this file except in compliance with
-+ * the License. You may obtain a copy of the License at
-+ *
-+ * http://www.apache.org/licenses/LICENSE-2.0
-+ *
-+ * Unless required by applicable law or agreed to in writing, software
-+ * distributed under the License is distributed on an "AS IS" BASIS,
-+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+ * See the License for the specific language governing permissions and
-+ * limitations under the License.
-+ */
-+
-+package org.apache.arrow.dataset.scanner.csv;
-+
-+import java.util.Map;
-+import java.util.Optional;
-+
-+import org.apache.arrow.c.ArrowSchema;
-+
-+public class CsvConvertOptions {
-+
-+ private final Map<String, String> configs;
-+
-+ private Optional<ArrowSchema> cSchema = Optional.empty();
-+
-+ public CsvConvertOptions(Map<String, String> configs) {
-+ this.configs = configs;
-+ }
-+
-+ public Optional<ArrowSchema> getArrowSchema() {
-+ return cSchema;
-+ }
-+
-+ public Map<String, String> getConfigs() {
-+ return configs;
-+ }
-+
-+ public void set(String key, String value) {
-+ configs.put(key, value);
-+ }
-+
-+ public void setArrowSchema(ArrowSchema cSchema) {
-+ this.cSchema = Optional.of(cSchema);
-+ }
-+
-+}
-diff --git
a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java
b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java
-new file mode 100644
-index 000000000..88973f0a0
---- /dev/null
-+++
b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java
-@@ -0,0 +1,97 @@
-+/*
-+ * Licensed to the Apache Software Foundation (ASF) under one or more
-+ * contributor license agreements. See the NOTICE file distributed with
-+ * this work for additional information regarding copyright ownership.
-+ * The ASF licenses this file to You under the Apache License, Version 2.0
-+ * (the "License"); you may not use this file except in compliance with
-+ * the License. You may obtain a copy of the License at
-+ *
-+ * http://www.apache.org/licenses/LICENSE-2.0
-+ *
-+ * Unless required by applicable law or agreed to in writing, software
-+ * distributed under the License is distributed on an "AS IS" BASIS,
-+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+ * See the License for the specific language governing permissions and
-+ * limitations under the License.
-+ */
-+
-+package org.apache.arrow.dataset.scanner.csv;
-+
-+import java.io.Serializable;
-+import java.nio.ByteBuffer;
-+import java.util.Locale;
-+import java.util.Map;
-+import java.util.stream.Collectors;
-+import java.util.stream.Stream;
-+
-+import org.apache.arrow.dataset.file.FileFormat;
-+import org.apache.arrow.dataset.scanner.FragmentScanOptions;
-+
-+public class CsvFragmentScanOptions implements Serializable,
FragmentScanOptions {
-+ private final CsvConvertOptions convertOptions;
-+ private final Map<String, String> readOptions;
-+ private final Map<String, String> parseOptions;
-+
-+
-+ /**
-+ * csv scan options, map to CPP struct CsvFragmentScanOptions.
-+ *
-+ * @param convertOptions same struct in CPP
-+ * @param readOptions same struct in CPP
-+ * @param parseOptions same struct in CPP
-+ */
-+ public CsvFragmentScanOptions(CsvConvertOptions convertOptions,
-+ Map<String, String> readOptions,
-+ Map<String, String> parseOptions) {
-+ this.convertOptions = convertOptions;
-+ this.readOptions = readOptions;
-+ this.parseOptions = parseOptions;
-+ }
-+
-+ public String typeName() {
-+ return FileFormat.CSV.name().toLowerCase(Locale.ROOT);
-+ }
-+
-+ /**
-+ * File format id.
-+ *
-+ * @return id
-+ */
-+ public int fileFormatId() {
-+ return FileFormat.CSV.id();
-+ }
-+
-+ /**
-+ * Serialize this class to ByteBuffer and then called by jni call.
-+ *
-+ * @return DirectByteBuffer
-+ */
-+ public ByteBuffer serialize() {
-+ Map<String, String> options =
Stream.concat(Stream.concat(readOptions.entrySet().stream(),
-+ parseOptions.entrySet().stream()),
-+ convertOptions.getConfigs().entrySet().stream()).collect(
-+ Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
-+
-+ if (convertOptions.getArrowSchema().isPresent()) {
-+ options.put("column_types",
Long.toString(convertOptions.getArrowSchema().get().memoryAddress()));
-+ }
-+ return serializeMap(options);
-+ }
-+
-+ public static CsvFragmentScanOptions deserialize(String serialized) {
-+ throw new UnsupportedOperationException("Not implemented now");
-+ }
-+
-+ public CsvConvertOptions getConvertOptions() {
-+ return convertOptions;
-+ }
-+
-+ public Map<String, String> getReadOptions() {
-+ return readOptions;
-+ }
-+
-+ public Map<String, String> getParseOptions() {
-+ return parseOptions;
-+ }
-+
-+}
-diff --git
a/java/dataset/src/main/java/org/apache/arrow/dataset/substrait/util/ConvertUtil.java
b/java/dataset/src/main/java/org/apache/arrow/dataset/substrait/util/ConvertUtil.java
-new file mode 100644
-index 000000000..31a4023af
---- /dev/null
-+++
b/java/dataset/src/main/java/org/apache/arrow/dataset/substrait/util/ConvertUtil.java
-@@ -0,0 +1,46 @@
-+/*
-+ * Licensed to the Apache Software Foundation (ASF) under one or more
-+ * contributor license agreements. See the NOTICE file distributed with
-+ * this work for additional information regarding copyright ownership.
-+ * The ASF licenses this file to You under the Apache License, Version 2.0
-+ * (the "License"); you may not use this file except in compliance with
-+ * the License. You may obtain a copy of the License at
-+ *
-+ * http://www.apache.org/licenses/LICENSE-2.0
-+ *
-+ * Unless required by applicable law or agreed to in writing, software
-+ * distributed under the License is distributed on an "AS IS" BASIS,
-+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+ * See the License for the specific language governing permissions and
-+ * limitations under the License.
-+ */
-+
-+package org.apache.arrow.dataset.substrait.util;
-+
-+import java.util.Map;
-+
-+import io.substrait.proto.Expression;
-+
-+public class ConvertUtil {
-+
-+ /**
-+ * Convert map to substrait Expression.
-+ *
-+ * @return Substrait Expression
-+ */
-+ public static Expression.Literal mapToExpressionLiteral(Map<String, String>
values) {
-+ Expression.Literal.Builder literalBuilder =
Expression.Literal.newBuilder();
-+ Expression.Literal.Map.KeyValue.Builder keyValueBuilder =
-+ Expression.Literal.Map.KeyValue.newBuilder();
-+ Expression.Literal.Map.Builder mapBuilder =
Expression.Literal.Map.newBuilder();
-+ for (Map.Entry<String, String> entry : values.entrySet()) {
-+ literalBuilder.setString(entry.getKey());
-+ keyValueBuilder.setKey(literalBuilder.build());
-+ literalBuilder.setString(entry.getValue());
-+ keyValueBuilder.setValue(literalBuilder.build());
-+ mapBuilder.addKeyValues(keyValueBuilder.build());
-+ }
-+ literalBuilder.setMap(mapBuilder.build());
-+ return literalBuilder.build();
-+ }
-+}
-diff --git
a/java/dataset/src/test/java/org/apache/arrow/dataset/substrait/TestAceroSubstraitConsumer.java
b/java/dataset/src/test/java/org/apache/arrow/dataset/substrait/TestAceroSubstraitConsumer.java
-index 0fba72892..e7903b7a4 100644
----
a/java/dataset/src/test/java/org/apache/arrow/dataset/substrait/TestAceroSubstraitConsumer.java
-+++
b/java/dataset/src/test/java/org/apache/arrow/dataset/substrait/TestAceroSubstraitConsumer.java
-@@ -31,6 +31,9 @@ import java.util.HashMap;
- import java.util.Map;
- import java.util.Optional;
-
-+import org.apache.arrow.c.ArrowSchema;
-+import org.apache.arrow.c.CDataDictionaryProvider;
-+import org.apache.arrow.c.Data;
- import org.apache.arrow.dataset.ParquetWriteSupport;
- import org.apache.arrow.dataset.TestDataset;
- import org.apache.arrow.dataset.file.FileFormat;
-@@ -38,8 +41,11 @@ import
org.apache.arrow.dataset.file.FileSystemDatasetFactory;
- import org.apache.arrow.dataset.jni.NativeMemoryPool;
- import org.apache.arrow.dataset.scanner.ScanOptions;
- import org.apache.arrow.dataset.scanner.Scanner;
-+import org.apache.arrow.dataset.scanner.csv.CsvConvertOptions;
-+import org.apache.arrow.dataset.scanner.csv.CsvFragmentScanOptions;
- import org.apache.arrow.dataset.source.Dataset;
- import org.apache.arrow.dataset.source.DatasetFactory;
-+import org.apache.arrow.memory.BufferAllocator;
- import org.apache.arrow.vector.ipc.ArrowReader;
- import org.apache.arrow.vector.types.pojo.ArrowType;
- import org.apache.arrow.vector.types.pojo.Field;
-@@ -49,6 +55,8 @@ import org.junit.ClassRule;
- import org.junit.Test;
- import org.junit.rules.TemporaryFolder;
-
-+import com.google.common.collect.ImmutableMap;
-+
- public class TestAceroSubstraitConsumer extends TestDataset {
-
- @ClassRule
-@@ -457,4 +465,42 @@ public class TestAceroSubstraitConsumer extends
TestDataset {
- substraitExpression.put(decodedSubstrait);
- return substraitExpression;
- }
-+
-+ @Test
-+ public void testCsvConvertOptions() throws Exception {
-+ final Schema schema = new Schema(Arrays.asList(
-+ Field.nullable("Id", new ArrowType.Int(32, true)),
-+ Field.nullable("Name", new ArrowType.Utf8()),
-+ Field.nullable("Language", new ArrowType.Utf8())
-+ ), null);
-+ String path = "file://" + getClass().getResource("/").getPath() +
"/data/student.csv";
-+ BufferAllocator allocator = rootAllocator();
-+ try (ArrowSchema cSchema = ArrowSchema.allocateNew(allocator);
-+ CDataDictionaryProvider provider = new CDataDictionaryProvider()) {
-+ Data.exportSchema(allocator, schema, provider, cSchema);
-+ CsvConvertOptions convertOptions = new
CsvConvertOptions(ImmutableMap.of("delimiter", ";"));
-+ convertOptions.setArrowSchema(cSchema);
-+ CsvFragmentScanOptions fragmentScanOptions = new CsvFragmentScanOptions(
-+ convertOptions, ImmutableMap.of(), ImmutableMap.of());
-+ ScanOptions options = new ScanOptions.Builder(/*batchSize*/ 32768)
-+ .columns(Optional.empty())
-+ .fragmentScanOptions(fragmentScanOptions)
-+ .build();
-+ try (
-+ DatasetFactory datasetFactory = new
FileSystemDatasetFactory(allocator, NativeMemoryPool.getDefault(),
-+ FileFormat.CSV, path);
-+ Dataset dataset = datasetFactory.finish();
-+ Scanner scanner = dataset.newScan(options);
-+ ArrowReader reader = scanner.scanBatches()
-+ ) {
-+ assertEquals(schema.getFields(),
reader.getVectorSchemaRoot().getSchema().getFields());
-+ int rowCount = 0;
-+ while (reader.loadNextBatch()) {
-+ assertEquals("[1, 2, 3]",
reader.getVectorSchemaRoot().getVector("Id").toString());
-+ rowCount += reader.getVectorSchemaRoot().getRowCount();
-+ }
-+ assertEquals(3, rowCount);
-+ }
-+ }
-+ }
- }
-diff --git a/java/dataset/src/test/resources/data/student.csv
b/java/dataset/src/test/resources/data/student.csv
-new file mode 100644
-index 000000000..329194609
---- /dev/null
-+++ b/java/dataset/src/test/resources/data/student.csv
-@@ -0,0 +1,4 @@
-+Id;Name;Language
-+1;Juno;Java
-+2;Peter;Python
-+3;Celin;C++
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]