This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new c6ef0fe73c GH-46411: [C++] Implemented dataset option in Meson (#47669)
c6ef0fe73c is described below
commit c6ef0fe73cc716d7949e06ca7ba4dfd0931bf10e
Author: William Ayd <[email protected]>
AuthorDate: Sat Oct 11 10:07:59 2025 -0400
GH-46411: [C++] Implemented dataset option in Meson (#47669)
### Rationale for this change
Adds more features to the Meson configuration
### What changes are included in this PR?
Implemented the dataset option
### Are these changes tested?
Yes
### Are there any user-facing changes?
No
* GitHub Issue: #46411
Authored-by: Will Ayd <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
cpp/meson.build | 16 +++-
cpp/meson.options | 5 +
cpp/src/arrow/acero/meson.build | 24 +++--
cpp/src/arrow/dataset/meson.build | 193 ++++++++++++++++++++++++++++++++++++++
4 files changed, 226 insertions(+), 12 deletions(-)
diff --git a/cpp/meson.build b/cpp/meson.build
index 81143ed1e2..b4fab26676 100644
--- a/cpp/meson.build
+++ b/cpp/meson.build
@@ -52,16 +52,19 @@ if git_description == '' and not meson.is_subproject()
endif
needs_benchmarks = get_option('benchmarks').enabled()
-needs_compute = get_option('compute').enabled()
needs_csv = get_option('csv').enabled()
+needs_dataset = get_option('dataset').enabled()
needs_azure = get_option('azure').enabled()
needs_gcs = get_option('gcs').enabled()
needs_hdfs = get_option('hdfs').enabled()
+needs_opentelemetry = false
+needs_orc = false
needs_parquet = get_option('parquet').enabled()
needs_parquet_encryption = get_option('parquet_require_encryption').enabled()
needs_s3 = get_option('s3').enabled()
needs_filesystem = (get_option('filesystem').enabled()
or needs_azure
+ or needs_dataset
or needs_gcs
or needs_hdfs
or needs_parquet_encryption
@@ -69,7 +72,8 @@ needs_filesystem = (get_option('filesystem').enabled()
)
needs_integration = get_option('integration').enabled()
needs_tests = get_option('tests').enabled()
-needs_acero = get_option('acero').enabled()
+needs_acero = get_option('acero').enabled() or needs_dataset
+needs_compute = get_option('compute').enabled() or needs_acero
needs_flight = get_option('flight').enabled()
needs_ipc = (get_option('ipc').enabled()
or needs_tests
@@ -112,3 +116,11 @@ if needs_parquet
subdir('examples/parquet')
endif
endif
+
+if needs_dataset
+ # Unlike the CMake configuration we need to add dataset support in the top level
+ # because it potentially requires parquet, which in turn requires arrow.
+ # When included in the subdir('src/arrow') call with parquet enabled, you end up
+ # with a circular dependency
+ subdir('src/arrow/dataset')
+endif
diff --git a/cpp/meson.options b/cpp/meson.options
index 668f440ee7..4082801803 100644
--- a/cpp/meson.options
+++ b/cpp/meson.options
@@ -39,6 +39,11 @@ option(
description: 'Build all Arrow Compute kernels',
)
option('csv', type: 'feature', description: 'Build the Arrow CSV Parser Module')
+option(
+ 'dataset',
+ type: 'feature',
+ description: 'Build the Arrow Dataset Modules',
+)
option(
'filesystem',
type: 'feature',
diff --git a/cpp/src/arrow/acero/meson.build b/cpp/src/arrow/acero/meson.build
index c7a8bdb4ca..9f34377024 100644
--- a/cpp/src/arrow/acero/meson.build
+++ b/cpp/src/arrow/acero/meson.build
@@ -90,7 +90,16 @@ arrow_acero_dep = declare_dependency(
)
meson.override_dependency('arrow-acero', arrow_acero_dep)
-arrow_acero_testing_sources = ['test_nodes.cc', 'test_util_internal.cc']
+arrow_acero_test_sources = ['test_nodes.cc', 'test_util_internal.cc']
+arrow_acero_test_lib = static_library(
+ 'arrow-acero-testing',
+ sources: arrow_acero_test_sources,
+ dependencies: [arrow_acero_dep, arrow_compute_test_dep],
+)
+arrow_acero_test_dep = declare_dependency(
+ link_with: [arrow_acero_test_lib],
+ dependencies: [arrow_acero_dep, arrow_compute_test_dep],
+)
arrow_acero_tests = {
'plan-test': {'sources': ['plan_test.cc', 'test_nodes_test.cc']},
@@ -114,8 +123,8 @@ arrow_acero_tests = {
foreach key, val : arrow_acero_tests
exc = executable(
'arrow-acero-@0@'.format(key),
- sources: val['sources'] + arrow_acero_testing_sources,
- dependencies: [arrow_acero_dep, arrow_compute_test_dep],
+ sources: val['sources'],
+ dependencies: [arrow_acero_test_dep],
)
test(key, exc)
endforeach
@@ -137,13 +146,8 @@ arrow_acero_benchmarks = {
foreach key, val : arrow_acero_benchmarks
exc = executable(
key,
- sources: val['sources'] + arrow_acero_testing_sources,
- dependencies: [
- arrow_acero_dep,
- arrow_compute_test_dep,
- arrow_benchmark_dep,
- gmock_dep,
- ],
+ sources: val['sources'],
+ dependencies: [arrow_acero_test_dep, arrow_benchmark_dep, gmock_dep],
)
benchmark(key, exc)
endforeach
diff --git a/cpp/src/arrow/dataset/meson.build b/cpp/src/arrow/dataset/meson.build
new file mode 100644
index 0000000000..409ce1de2b
--- /dev/null
+++ b/cpp/src/arrow/dataset/meson.build
@@ -0,0 +1,193 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+install_headers(
+ [
+ 'api.h',
+ 'dataset.h',
+ 'dataset_writer.h',
+ 'discovery.h',
+ 'file_base.h',
+ 'file_csv.h',
+ 'file_ipc.h',
+ 'file_json.h',
+ 'file_orc.h',
+ 'file_parquet.h',
+ 'parquet_encryption_config.h',
+ 'partition.h',
+ 'plan.h',
+ 'projector.h',
+ 'scanner.h',
+ 'type_fwd.h',
+ 'visibility.h',
+ ],
+ subdir: 'arrow/dataset',
+)
+
+arrow_dataset_srcs = files(
+ 'dataset.cc',
+ 'dataset_writer.cc',
+ 'discovery.cc',
+ 'file_base.cc',
+ 'file_ipc.cc',
+ 'partition.cc',
+ 'plan.cc',
+ 'projector.cc',
+ 'scan_node.cc',
+ 'scanner.cc',
+)
+
+arrow_dataset_deps = [arrow_acero_dep, arrow_compute_dep]
+arrow_pkgconfig_requires = ['arrow-acero', 'arrow-compute']
+if needs_csv
+ arrow_dataset_srcs += ['file_csv.cc']
+endif
+
+if needs_json
+ arrow_dataset_srcs += ['file_json.cc']
+endif
+
+if needs_orc
+ arrow_dataset_srcs += ['file_orc.cc']
+endif
+
+if needs_parquet
+ arrow_dataset_srcs += ['file_parquet.cc']
+ arrow_dataset_deps += [parquet_dep]
+ arrow_pkgconfig_requires += ['parquet']
+endif
+
+if needs_opentelemetry
+ arrow_dataset_deps += [opentelemetry_dep]
+endif
+
+arrow_dataset_lib = library(
+ 'arrow_dataset',
+ sources: arrow_dataset_srcs,
+ dependencies: arrow_dataset_deps,
+ cpp_static_args: ['-DARROW_DS_STATIC'],
+ cpp_shared_args: ['-DARROW_DS_EXPORTING'],
+ gnu_symbol_visibility: 'inlineshidden',
+)
+
+arrow_dataset_args = []
+if get_option('default_library') == 'static'
+ arrow_dataset_args += ['-DARROW_DS_STATIC']
+endif
+
+arrow_dataset_dep = declare_dependency(
+ link_with: [arrow_dataset_lib],
+ dependencies: arrow_dataset_deps,
+ compile_args: arrow_dataset_args,
+)
+meson.override_dependency('arrow-dataset', arrow_dataset_dep)
+
+pkg_config_cflags = get_option('default_library') == 'static' ? '-DARROW_DS_STATIC' : ''
+pkg_config_cflags_private = get_option('default_library') != 'static' ? '-DARROW_DS_STATIC' : ''
+pkg.generate(
+ arrow_dataset_lib,
+ filebase: 'arrow-dataset',
+ name: 'Apache Arrow Dataset',
+ description: 'Apache Arrow Dataset provides an API to read and write semantic datasets stored in different locations and formats.',
+ extra_cflags: [pkg_config_cflags],
+ requires: arrow_pkgconfig_requires,
+ variables: {'Cflags.private': pkg_config_cflags_private},
+)
+
+if needs_testing
+ arrow_dataset_testing_lib = static_library(
+ 'arrow_dataset_testing',
+ sources: ['test_util_internal.cc'],
+ dependencies: [arrow_dataset_dep, arrow_acero_test_dep],
+ )
+ arrow_dataset_test_dep = declare_dependency(
+ link_with: [arrow_dataset_testing_lib],
+ dependencies: [arrow_dataset_dep, arrow_acero_test_dep],
+ )
+else
+ arrow_dataset_test_dep = disabler()
+endif
+
+dataset_tests = {
+ 'dataset': {'sources': ['dataset_test.cc']},
+ 'dataset_writer': {'sources': ['dataset_writer_test.cc']},
+ 'discovery': {'sources': ['discovery_test.cc']},
+ 'file_ipc': {'sources': ['file_ipc_test.cc']},
+ 'file': {'sources': ['file_test.cc']},
+ 'partition': {'sources': ['partition_test.cc']},
+ 'scanner': {'sources': ['scanner_test.cc']},
+ 'subtree': {'sources': ['subtree_test.cc']},
+ 'write_node': {'sources': ['write_node_test.cc']},
+}
+
+if needs_csv
+ dataset_tests += {'file_csv': {'sources': ['file_csv_test.cc']}}
+endif
+
+if needs_json
+ dataset_tests += {
+ 'file_json': {
+ 'sources': ['file_json_test.cc'],
+ 'dependencies': [rapidjson_dep],
+ },
+ }
+endif
+
+if needs_orc
+ dataset_tests += {
+ 'file_orc': {'sources': ['file_orc_test.cc'], 'dependencies': [orc_dep]},
+ }
+endif
+
+if needs_parquet
+ dataset_tests += {'file_parquet': {'sources': ['file_parquet_test.cc']}}
+ if needs_parquet_encryption
+ dataset_tests += {
+ 'file_parquet_encryption': {
+ 'sources': [
+ 'file_parquet_encryption_test.cc',
+ meson.project_source_root() / 'src/parquet/encryption/test_in_memory_kms.cc',
+ ],
+ },
+ }
+ endif
+endif
+
+foreach key, value : dataset_tests
+ test_name = 'arrow-dataset-@0@'.format(key.replace('_', '-'))
+ exc = executable(
+ test_name,
+ sources: value['sources'],
+ dependencies: [arrow_dataset_test_dep, value.get('dependencies', [])],
+ )
+ test(test_name, exc)
+endforeach
+
+dataset_benchmarks = ['file', 'scanner']
+foreach benchmark : dataset_benchmarks
+ benchmark_name = f'arrow-dataset-@benchmark@-benchmark'
+ exc = executable(
+ benchmark_name,
+ sources: [f'@benchmark@_test.cc'],
+ dependencies: [
+ arrow_dataset_dep,
+ arrow_benchmark_dep,
+ arrow_acero_test_dep,
+ ],
+ )
+ benchmark(benchmark_name, exc)
+endforeach