davisusanibar commented on PR #34866:
URL: https://github.com/apache/arrow/pull/34866#issuecomment-1497280576
> Sure!
>
> I think that we should fix this on the C++ side, not the Java side.
>
> Could you revert the current change and apply the following patch?
>
> ```diff
> diff --git a/cpp/src/arrow/dataset/CMakeLists.txt b/cpp/src/arrow/dataset/CMakeLists.txt
> index e1b14b77c4..bdc65573b4 100644
> --- a/cpp/src/arrow/dataset/CMakeLists.txt
> +++ b/cpp/src/arrow/dataset/CMakeLists.txt
> @@ -25,6 +25,7 @@ set(ARROW_DATASET_SRCS
> discovery.cc
> file_base.cc
> file_ipc.cc
> + file_parquet.cc
> partition.cc
> plan.cc
> projector.cc
> @@ -39,39 +40,26 @@ endif()
>
> set(ARROW_DATASET_STATIC_LINK_LIBS)
> set(ARROW_DATASET_SHARED_LINK_LIBS)
> -set(ARROW_DATASET_STATIC_INSTALL_INTERFACE_LIBS)
> -set(ARROW_DATASET_SHARED_INSTALL_INTERFACE_LIBS)
> +set(ARROW_DATASET_STATIC_INSTALL_INTERFACE_LIBS ArrowAcero::arrow_acero_static)
> +set(ARROW_DATASET_SHARED_INSTALL_INTERFACE_LIBS ArrowAcero::arrow_acero_shared)
>
> if(ARROW_CSV)
> - set(ARROW_DATASET_SRCS ${ARROW_DATASET_SRCS} file_csv.cc)
> + list(APPEND ARROW_DATASET_SRCS file_csv.cc)
> endif()
>
> if(ARROW_JSON)
> - set(ARROW_DATASET_SRCS ${ARROW_DATASET_SRCS} file_json.cc)
> + list(APPEND ARROW_DATASET_SRCS file_json.cc)
> endif()
>
> if(ARROW_ORC)
> - set(ARROW_DATASET_SRCS ${ARROW_DATASET_SRCS} file_orc.cc)
> -endif()
> -
> -if(ARROW_PARQUET)
> - list(APPEND ARROW_DATASET_STATIC_LINK_LIBS parquet_static)
> - list(APPEND ARROW_DATASET_SHARED_LINK_LIBS parquet_shared)
> - list(APPEND ARROW_DATASET_STATIC_INSTALL_INTERFACE_LIBS Parquet::parquet_static)
> - list(APPEND ARROW_DATASET_SHARED_INSTALL_INTERFACE_LIBS Parquet::parquet_shared)
> - list(APPEND ARROW_DATASET_SRCS file_parquet.cc)
> - list(APPEND ARROW_DATASET_PRIVATE_INCLUDES ${PROJECT_SOURCE_DIR}/src/parquet)
> -else()
> - list(APPEND ARROW_DATASET_STATIC_INSTALL_INTERFACE_LIBS Arrow::arrow_static)
> - list(APPEND ARROW_DATASET_SHARED_INSTALL_INTERFACE_LIBS Arrow::arrow_shared)
> + list(APPEND ARROW_DATASET_SRCS file_orc.cc)
> endif()
>
> list(APPEND
> ARROW_DATASET_STATIC_LINK_LIBS
> - arrow_static
> arrow_acero_static
> ${ARROW_STATIC_LINK_LIBS})
> -list(APPEND ARROW_DATASET_SHARED_LINK_LIBS arrow_shared arrow_acero_shared)
> +list(APPEND ARROW_DATASET_SHARED_LINK_LIBS arrow_acero_shared)
>
> add_arrow_lib(arrow_dataset
> CMAKE_PACKAGE_NAME
> ```
@kou I just applied that change on the C++ side, but the error persists in the
Java JNI Dataset module.
This is probably because, for example, the Java JNI Dataset needs
`arrow::dataset::FileSystemDataset::Write`, which in turn calls
`arrow::acero::Declaration::Sequence`, so we would need to add
`ArrowDataset::arrow_dataset_static` and `ArrowAcero::arrow_acero_static` to
the Java Dataset CMakeLists.
Error message:
```
+ cmake --build . --config release
[1/1] Linking CXX shared library dataset/libarrow_dataset_jni.dylib
FAILED: dataset/libarrow_dataset_jni.dylib
: &&
/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/c++
-O3 -DNDEBUG -isysroot
/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX13.3.sdk
-mmacosx-version-min=13.2 -dynamiclib -Wl,-headerpad_max_install_names -o
dataset/libarrow_dataset_jni.dylib -install_name
@rpath/libarrow_dataset_jni.dylib
dataset/CMakeFiles/arrow_java_jni_dataset.dir/src/main/cpp/jni_wrapper.cc.o
dataset/CMakeFiles/arrow_java_jni_dataset.dir/src/main/cpp/jni_util.cc.o
/Users/dsusanibar/voltron/jiraarrow/main3/arrow/cpp-build/cpp-install/lib/libarrow_dataset.a
/Users/dsusanibar/voltron/jiraarrow/main3/arrow/cpp-build/cpp-install/lib/libparquet.a
/Users/dsusanibar/voltron/jiraarrow/main3/arrow/cpp-build/cpp-install/lib/libarrow.a
/Users/dsusanibar/voltron/jiraarrow/main3/arrow/cpp-build/cpp-install/lib/libarrow_bundled_dependencies.a
-Xlinker -framework -Xlinker CoreFoundation -Xlinker -framework -Xlinker
Security /usr/local
/opt/[email protected]/lib/libssl.a /usr/local/opt/[email protected]/lib/libcrypto.a
/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX13.3.sdk/usr/lib/libz.tbd
/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX13.3.sdk/usr/lib/libcurl.tbd
/usr/local/Cellar/thrift/0.18.1/lib/libthrift.a && :
Undefined symbols for architecture x86_64:
"arrow::acero::Declaration::Sequence(std::__1::vector<arrow::acero::Declaration,
std::__1::allocator<arrow::acero::Declaration> >)", referenced from:
arrow::dataset::FileSystemDataset::Write(arrow::dataset::FileSystemDatasetWriteOptions
const&, std::__1::shared_ptr<arrow::dataset::Scanner>) in
libarrow_dataset.a(unity_0_cxx.cxx.o)
arrow::dataset::(anonymous
namespace)::AsyncScanner::ScanBatchesUnorderedAsync(arrow::internal::Executor*,
bool, bool) in libarrow_dataset.a(unity_1_cxx.cxx.o)
arrow::dataset::(anonymous
namespace)::AsyncScanner::CountRowsAsync(arrow::internal::Executor*) in
libarrow_dataset.a(unity_1_cxx.cxx.o)
"arrow::acero::QueryContext::ScheduleTask(std::__1::function<arrow::Status
()>, std::__1::basic_string_view<char, std::__1::char_traits<char> >)",
referenced from:
arrow::internal::FnOnce<void (arrow::FutureImpl
const&)>::FnImpl<arrow::Future<std::__1::shared_ptr<arrow::RecordBatch>
>::WrapResultyOnComplete::Callback<arrow::Future<std::__1::shared_ptr<arrow::RecordBatch>
>::ThenOnComplete<arrow::dataset::(anonymous
namespace)::ScanNode::ScanBatchTask::operator()()::'lambda'(std::__1::shared_ptr<arrow::RecordBatch>
const&), arrow::Future<std::__1::shared_ptr<arrow::RecordBatch>
>::PassthruOnFailure<arrow::dataset::(anonymous
namespace)::ScanNode::ScanBatchTask::operator()()::'lambda'(std::__1::shared_ptr<arrow::RecordBatch>
const&)> > > >::invoke(arrow::FutureImpl const&) in
libarrow_dataset.a(unity_1_cxx.cxx.o)
"arrow::acero::DeclarationToStatus(arrow::acero::Declaration, bool,
arrow::MemoryPool*, arrow::compute::FunctionRegistry*)", referenced from:
arrow::dataset::FileSystemDataset::Write(arrow::dataset::FileSystemDatasetWriteOptions
const&, std::__1::shared_ptr<arrow::dataset::Scanner>) in
libarrow_dataset.a(unity_0_cxx.cxx.o)
"arrow::acero::ValidateExecNodeInputs(arrow::acero::ExecPlan*,
std::__1::vector<arrow::acero::ExecNode*,
std::__1::allocator<arrow::acero::ExecNode*> > const&, int, char const*)",
referenced from:
arrow::dataset::(anonymous
namespace)::TeeNode::Make(arrow::acero::ExecPlan*,
std::__1::vector<arrow::acero::ExecNode*,
std::__1::allocator<arrow::acero::ExecNode*> >, arrow::acero::ExecNodeOptions
const&) in libarrow_dataset.a(unity_0_cxx.cxx.o)
arrow::dataset::(anonymous
namespace)::ScanNode::Make(arrow::acero::ExecPlan*,
std::__1::vector<arrow::acero::ExecNode*,
std::__1::allocator<arrow::acero::ExecNode*> >, arrow::acero::ExecNodeOptions
const&) in libarrow_dataset.a(unity_1_cxx.cxx.o)
"arrow::acero::DeclarationToBatchesAsync(arrow::acero::Declaration,
arrow::compute::ExecContext)", referenced from:
arrow::dataset::(anonymous
namespace)::AsyncScanner::CountRowsAsync(arrow::internal::Executor*) in
libarrow_dataset.a(unity_1_cxx.cxx.o)
"arrow::acero::default_exec_factory_registry()", referenced from:
arrow::dataset::MakeWriteNode(arrow::acero::ExecPlan*,
std::__1::vector<arrow::acero::ExecNode*,
std::__1::allocator<arrow::acero::ExecNode*> >, arrow::acero::ExecNodeOptions
const&) in libarrow_dataset.a(unity_0_cxx.cxx.o)
void
std::__1::__call_once_proxy[abi:v15006]<std::__1::tuple<arrow::dataset::internal::Initialize()::$_14&&>
>(void*) in libarrow_dataset.a(unity_0_cxx.cxx.o)
arrow::dataset::(anonymous
namespace)::MakeScanNode(arrow::acero::ExecPlan*,
std::__1::vector<arrow::acero::ExecNode*,
std::__1::allocator<arrow::acero::ExecNode*> >, arrow::acero::ExecNodeOptions
const&) in libarrow_dataset.a(unity_1_cxx.cxx.o)
arrow::dataset::(anonymous
namespace)::MakeOrderedSinkNode(arrow::acero::ExecPlan*,
std::__1::vector<arrow::acero::ExecNode*,
std::__1::allocator<arrow::acero::ExecNode*> >, arrow::acero::ExecNodeOptions
const&) in libarrow_dataset.a(unity_1_cxx.cxx.o)
arrow::dataset::(anonymous
namespace)::MakeAugmentedProjectNode(arrow::acero::ExecPlan*,
std::__1::vector<arrow::acero::ExecNode*,
std::__1::allocator<arrow::acero::ExecNode*> >, arrow::acero::ExecNodeOptions
const&) in libarrow_dataset.a(unity_1_cxx.cxx.o)
arrow::dataset::(anonymous
namespace)::AsyncScanner::ScanBatchesUnorderedAsync(arrow::internal::Executor*,
bool, bool) in libarrow_dataset.a(unity_1_cxx.cxx.o)
"arrow::acero::MapNode::InputFinished(arrow::acero::ExecNode*, int)",
referenced from:
vtable for arrow::dataset::(anonymous namespace)::TeeNode in
libarrow_dataset.a(unity_0_cxx.cxx.o)
"arrow::acero::MapNode::InputReceived(arrow::acero::ExecNode*,
arrow::compute::ExecBatch)", referenced from:
vtable for arrow::dataset::(anonymous namespace)::TeeNode in
libarrow_dataset.a(unity_0_cxx.cxx.o)
"arrow::acero::MapNode::PauseProducing(arrow::acero::ExecNode*, int)",
referenced from:
vtable for arrow::dataset::(anonymous namespace)::TeeNode in
libarrow_dataset.a(unity_0_cxx.cxx.o)
"arrow::acero::MapNode::StartProducing()", referenced from:
arrow::dataset::(anonymous namespace)::TeeNode::StartProducing() in
libarrow_dataset.a(unity_0_cxx.cxx.o)
"arrow::acero::MapNode::ResumeProducing(arrow::acero::ExecNode*, int)",
referenced from:
vtable for arrow::dataset::(anonymous namespace)::TeeNode in
libarrow_dataset.a(unity_0_cxx.cxx.o)
"arrow::acero::MapNode::StopProducingImpl()", referenced from:
vtable for arrow::dataset::(anonymous namespace)::TeeNode in
libarrow_dataset.a(unity_0_cxx.cxx.o)
"arrow::acero::MapNode::Finish()", referenced from:
std::__1::__function::__func<arrow::dataset::(anonymous
namespace)::TeeNode::StartProducing()::'lambda1'(),
std::__1::allocator<arrow::dataset::(anonymous
namespace)::TeeNode::StartProducing()::'lambda1'()>, void ()>::operator()() in
libarrow_dataset.a(unity_0_cxx.cxx.o)
"arrow::acero::MapNode::MapNode(arrow::acero::ExecPlan*,
std::__1::vector<arrow::acero::ExecNode*,
std::__1::allocator<arrow::acero::ExecNode*> >,
std::__1::shared_ptr<arrow::Schema>)", referenced from:
arrow::dataset::(anonymous
namespace)::TeeNode::Make(arrow::acero::ExecPlan*,
std::__1::vector<arrow::acero::ExecNode*,
std::__1::allocator<arrow::acero::ExecNode*> >, arrow::acero::ExecNodeOptions
const&) in libarrow_dataset.a(unity_0_cxx.cxx.o)
"arrow::acero::ExecNode::Init()", referenced from:
vtable for arrow::dataset::(anonymous namespace)::TeeNode in
libarrow_dataset.a(unity_0_cxx.cxx.o)
"arrow::acero::ExecNode::ExecNode(arrow::acero::ExecPlan*,
std::__1::vector<arrow::acero::ExecNode*,
std::__1::allocator<arrow::acero::ExecNode*> >,
std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>,
std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char,
std::__1::char_traits<char>, std::__1::allocator<char> > > >,
std::__1::shared_ptr<arrow::Schema>)", referenced from:
arrow::dataset::(anonymous
namespace)::ScanNode::Make(arrow::acero::ExecPlan*,
std::__1::vector<arrow::acero::ExecNode*,
std::__1::allocator<arrow::acero::ExecNode*> >, arrow::acero::ExecNodeOptions
const&) in libarrow_dataset.a(unity_1_cxx.cxx.o)
"arrow::acero::ExecPlan::StopProducing()", referenced from:
arrow::dataset::(anonymous
namespace)::AsyncScanner::ScanBatchesUnorderedAsync(arrow::internal::Executor*,
bool, bool)::$_16::operator()(...) const in
libarrow_dataset.a(unity_1_cxx.cxx.o)
"arrow::acero::ExecPlan::query_context()", referenced from:
arrow::dataset::(anonymous namespace)::TeeNode::StartProducing() in
libarrow_dataset.a(unity_0_cxx.cxx.o)
arrow::dataset::(anonymous
namespace)::DatasetWritingSinkNodeConsumer::Init(std::__1::shared_ptr<arrow::Schema>
const&, arrow::acero::BackpressureControl*, arrow::acero::ExecPlan*) in
libarrow_dataset.a(unity_0_cxx.cxx.o)
arrow::dataset::(anonymous
namespace)::ScanNode::Make(arrow::acero::ExecPlan*,
std::__1::vector<arrow::acero::ExecNode*,
std::__1::allocator<arrow::acero::ExecNode*> >, arrow::acero::ExecNodeOptions
const&) in libarrow_dataset.a(unity_1_cxx.cxx.o)
arrow::dataset::(anonymous namespace)::ScanNode::StartProducing() in
libarrow_dataset.a(unity_1_cxx.cxx.o)
arrow::internal::FnOnce<void (arrow::FutureImpl
const&)>::FnImpl<arrow::Future<std::__1::function<arrow::Future<std::__1::shared_ptr<arrow::dataset::Fragment>
> ()>
>::WrapResultyOnComplete::Callback<arrow::Future<std::__1::function<arrow::Future<std::__1::shared_ptr<arrow::dataset::Fragment>
> ()> >::ThenOnComplete<arrow::dataset::(anonymous
namespace)::ScanNode::StartProducing()::'lambda'()::operator()()
const::'lambda'(std::__1::function<arrow::Future<std::__1::shared_ptr<arrow::dataset::Fragment>
> ()> const&),
arrow::Future<std::__1::function<arrow::Future<std::__1::shared_ptr<arrow::dataset::Fragment>
> ()> >::PassthruOnFailure<arrow::dataset::(anonymous
namespace)::ScanNode::StartProducing()::'lambda'()::operator()()
const::'lambda'(std::__1::function<arrow::Future<std::__1::shared_ptr<arrow::dataset::Fragment>
> ()> const&)> > > >::invoke(arrow::FutureImpl const&) in
libarrow_dataset.a(unity_1_cxx.cxx.o)
arrow::dataset::(anonymous
namespace)::ScanNode::ListFragmentTask::operator()() in
libarrow_dataset.a(unity_1_cxx.cxx.o)
arrow::internal::FnOnce<void (arrow::FutureImpl
const&)>::FnImpl<arrow::Future<std::__1::shared_ptr<arrow::dataset::InspectedFragment>
>::WrapResultyOnComplete::Callback<arrow::Future<std::__1::shared_ptr<arrow::dataset::InspectedFragment>
>::ThenOnComplete<arrow::dataset::(anonymous
namespace)::ScanNode::ListFragmentTask::operator()()::'lambda'(std::__1::shared_ptr<arrow::dataset::InspectedFragment>
const&), arrow::Future<std::__1::shared_ptr<arrow::dataset::InspectedFragment>
>::PassthruOnFailure<arrow::dataset::(anonymous
namespace)::ScanNode::ListFragmentTask::operator()()::'lambda'(std::__1::shared_ptr<arrow::dataset::InspectedFragment>
const&)> > > >::invoke(arrow::FutureImpl const&) in
libarrow_dataset.a(unity_1_cxx.cxx.o)
...
"arrow::acero::ExecPlan::StartProducing()", referenced from:
arrow::dataset::(anonymous
namespace)::AsyncScanner::ScanBatchesUnorderedAsync(arrow::internal::Executor*,
bool, bool) in libarrow_dataset.a(unity_1_cxx.cxx.o)
"arrow::acero::ExecPlan::Make(arrow::acero::QueryOptions,
arrow::compute::ExecContext, std::__1::shared_ptr<arrow::KeyValueMetadata
const>)", referenced from:
arrow::dataset::(anonymous
namespace)::AsyncScanner::ScanBatchesUnorderedAsync(arrow::internal::Executor*,
bool, bool) in libarrow_dataset.a(unity_1_cxx.cxx.o)
"arrow::acero::ExecPlan::Make(arrow::compute::ExecContext,
std::__1::shared_ptr<arrow::KeyValueMetadata const>)", referenced from:
arrow::dataset::(anonymous
namespace)::AsyncScanner::CountRowsAsync(arrow::internal::Executor*) in
libarrow_dataset.a(unity_1_cxx.cxx.o)
"arrow::acero::ExecPlan::AddNode(std::__1::unique_ptr<arrow::acero::ExecNode,
std::__1::default_delete<arrow::acero::ExecNode> >)", referenced from:
arrow::dataset::(anonymous
namespace)::TeeNode::Make(arrow::acero::ExecPlan*,
std::__1::vector<arrow::acero::ExecNode*,
std::__1::allocator<arrow::acero::ExecNode*> >, arrow::acero::ExecNodeOptions
const&) in libarrow_dataset.a(unity_0_cxx.cxx.o)
arrow::dataset::(anonymous
namespace)::ScanNode::Make(arrow::acero::ExecPlan*,
std::__1::vector<arrow::acero::ExecNode*,
std::__1::allocator<arrow::acero::ExecNode*> >, arrow::acero::ExecNodeOptions
const&) in libarrow_dataset.a(unity_1_cxx.cxx.o)
"arrow::acero::ExecPlan::finished()", referenced from:
arrow::dataset::(anonymous
namespace)::AsyncScanner::ScanBatchesUnorderedAsync(arrow::internal::Executor*,
bool, bool)::$_16::operator()(...) const in
libarrow_dataset.a(unity_1_cxx.cxx.o)
arrow::internal::FnOnce<void (arrow::FutureImpl
const&)>::FnImpl<arrow::Future<arrow::dataset::EnumeratedRecordBatch>::WrapResultyOnComplete::Callback<arrow::Future<arrow::dataset::EnumeratedRecordBatch>::ThenOnComplete<arrow::dataset::(anonymous
namespace)::AsyncScanner::ScanBatchesUnorderedAsync(arrow::internal::Executor*,
bool, bool)::$_18::operator()()
const::'lambda'(arrow::dataset::EnumeratedRecordBatch const&),
arrow::Future<arrow::dataset::EnumeratedRecordBatch>::PassthruOnFailure<arrow::dataset::(anonymous
namespace)::AsyncScanner::ScanBatchesUnorderedAsync(arrow::internal::Executor*,
bool, bool)::$_18::operator()()
const::'lambda'(arrow::dataset::EnumeratedRecordBatch const&)> > >
>::invoke(arrow::FutureImpl const&) in libarrow_dataset.a(unity_1_cxx.cxx.o)
"arrow::acero::TracedNode::NoteStartProducing(std::__1::basic_string<char,
std::__1::char_traits<char>, std::__1::allocator<char> >) const", referenced
from:
arrow::dataset::(anonymous namespace)::ScanNode::StartProducing() in
libarrow_dataset.a(unity_1_cxx.cxx.o)
"arrow::acero::Declaration::AddToPlan(arrow::acero::ExecPlan*,
arrow::acero::ExecFactoryRegistry*) const", referenced from:
arrow::dataset::(anonymous
namespace)::AsyncScanner::ScanBatchesUnorderedAsync(arrow::internal::Executor*,
bool, bool) in libarrow_dataset.a(unity_1_cxx.cxx.o)
"arrow::acero::MapNode::ordering() const", referenced from:
vtable for arrow::dataset::(anonymous namespace)::TeeNode in
libarrow_dataset.a(unity_0_cxx.cxx.o)
"arrow::acero::ExecNode::ToStringExtra(int) const", referenced from:
vtable for arrow::dataset::(anonymous namespace)::ScanNode in
libarrow_dataset.a(unity_1_cxx.cxx.o)
"arrow::acero::ExecNode::Validate() const", referenced from:
vtable for arrow::dataset::(anonymous namespace)::TeeNode in
libarrow_dataset.a(unity_0_cxx.cxx.o)
vtable for arrow::dataset::(anonymous namespace)::ScanNode in
libarrow_dataset.a(unity_1_cxx.cxx.o)
"arrow::acero::ExecNode::ordering() const", referenced from:
vtable for arrow::dataset::(anonymous namespace)::ScanNode in
libarrow_dataset.a(unity_1_cxx.cxx.o)
"typeinfo for arrow::acero::MapNode", referenced from:
typeinfo for arrow::dataset::(anonymous namespace)::TeeNode in
libarrow_dataset.a(unity_0_cxx.cxx.o)
"typeinfo for arrow::acero::ExecNode", referenced from:
typeinfo for arrow::dataset::(anonymous namespace)::ScanNode in
libarrow_dataset.a(unity_1_cxx.cxx.o)
"vtable for arrow::acero::ExecNode", referenced from:
arrow::acero::ExecNode::~ExecNode() in
libarrow_dataset.a(unity_0_cxx.cxx.o)
arrow::acero::ExecNode::~ExecNode() in
libarrow_dataset.a(unity_1_cxx.cxx.o)
NOTE: a missing vtable usually means the first non-inline virtual member
function has no definition.
ld: symbol(s) not found for architecture x86_64
clang: error: linker command failed with exit code 1 (use -v to see
invocation)
ninja: build stopped: subcommand failed.
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]