kou commented on a change in pull request #10913:
URL: https://github.com/apache/arrow/pull/10913#discussion_r704756414
##########
File path: .github/workflows/cpp.yml
##########
@@ -60,11 +60,15 @@ jobs:
image:
- conda-cpp
- ubuntu-cpp-sanitizer
+ - ubuntu-cpp-skyhook
include:
- image: conda-cpp
title: AMD64 Conda C++
- image: ubuntu-cpp-sanitizer
title: AMD64 Ubuntu 20.04 C++ ASAN UBSAN
+ - image: ubuntu-cpp-skyhook
+ ubuntu: 20.04
Review comment:
Do we need this?
##########
File path: cpp/src/skyhook/CMakeLists.txt
##########
@@ -0,0 +1,97 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitationsn
+# under the License.
+
+#
+# arrow_skyhook
+#
+# define project properties
+project(arrow_skyhook)
+
+# add the client subdirectory
+add_subdirectory(client)
+
+# define the targets to build
+add_custom_target(arrow_skyhook_client)
+add_custom_target(cls_skyhook)
+
+# define the dependencies
+find_package(librados REQUIRED)
+include_directories(${LIBRADOS_INCLUDE_DIR})
Review comment:
Could you use `librados::rados` target?
##########
File path: ci/docker/ubuntu-20.04-cpp.dockerfile
##########
@@ -80,6 +80,9 @@ RUN apt-get update -y -q && \
libprotoc-dev \
libre2-dev \
libsnappy-dev \
+ libradospp-dev \
+ rados-objclass-dev \
+ python3-rados \
Review comment:
Could you keep this list in alphabetical order?
##########
File path: cpp/src/skyhook/CMakeLists.txt
##########
@@ -0,0 +1,97 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitationsn
+# under the License.
+
+#
+# arrow_skyhook
+#
+# define project properties
+project(arrow_skyhook)
Review comment:
We don't need to add this because we have `project()` in
`cpp/CMakeLists.txt`.
##########
File path: ci/scripts/cpp_build.sh
##########
@@ -64,6 +64,7 @@ cmake -G "${CMAKE_GENERATOR:-Ninja}" \
-DARROW_CUDA=${ARROW_CUDA:-OFF} \
-DARROW_CXXFLAGS=${ARROW_CXXFLAGS:-} \
-DARROW_DATASET=${ARROW_DATASET:-ON} \
+ -DARROW_SKYHOOK=${ARROW_SKYHOOK:-OFF} \
Review comment:
Could you keep this list in alphabetical order?
##########
File path: ci/docker/ubuntu-20.04-cpp.dockerfile
##########
@@ -96,7 +99,10 @@ RUN apt-get update -y -q && \
COPY ci/scripts/install_minio.sh \
/arrow/ci/scripts/
+COPY ci/scripts/install_ceph.sh \
+ /arrow/ci/scripts/
RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local
+RUN /arrow/ci/scripts/install_ceph.sh
Review comment:
We can use Docker's build cache as much as possible by grouping `COPY`
and `RUN`:
```suggestion
COPY ci/scripts/install_ceph.sh \
/arrow/ci/scripts/
RUN /arrow/ci/scripts/install_ceph.sh
COPY ci/scripts/install_minio.sh \
/arrow/ci/scripts/
RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local
```
If we use `COPY ...\nCOPY ...\nRUN ...\nRUN ...\n`, we always need to re-run
both `RUN`s when we change `install_minio.sh` or `install_ceph.sh`.
##########
File path: cpp/src/skyhook/CMakeLists.txt
##########
@@ -0,0 +1,97 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitationsn
+# under the License.
+
+#
+# arrow_skyhook
+#
+# define project properties
+project(arrow_skyhook)
+
+# add the client subdirectory
+add_subdirectory(client)
+
+# define the targets to build
+add_custom_target(arrow_skyhook_client)
+add_custom_target(cls_skyhook)
+
+# define the dependencies
+find_package(librados REQUIRED)
+include_directories(${LIBRADOS_INCLUDE_DIR})
+set(ARROW_SKYHOOK_LINK_STATIC arrow_dataset_static)
+set(ARROW_SKYHOOK_LINK_SHARED arrow_dataset_shared)
+set(ARROW_SKYHOOK_LINK_STATIC ${ARROW_SKYHOOK_LINK_STATIC} ${LIBRADOS_LIBRARY})
+set(ARROW_SKYHOOK_LINK_SHARED ${ARROW_SKYHOOK_LINK_SHARED} ${LIBRADOS_LIBRARY})
+
+# define the client and cls sources
+set(ARROW_SKYHOOK_CLIENT_SOURCES client/file_skyhook.cc
protocol/rados_protocol.cc
+ protocol/skyhook_protocol.cc)
+set(ARROW_SKYHOOK_CLS_SOURCES cls/cls_skyhook.cc protocol/rados_protocol.cc
+ protocol/skyhook_protocol.cc)
+
+# define the client library
+add_arrow_lib(arrow_skyhook_client
+ SOURCES
+ ${ARROW_SKYHOOK_CLIENT_SOURCES}
+ OUTPUTS
+ ARROW_SKYHOOK_CLIENT_LIBRARIES
+ SHARED_LINK_LIBS
+ ${ARROW_SKYHOOK_LINK_SHARED}
+ STATIC_LINK_LIBS
+ ${ARROW_SKYHOOK_LINK_STATIC})
+
+ # define the cls library
Review comment:
```suggestion
# define the cls library
```
##########
File path: docker-compose.yml
##########
@@ -508,6 +509,38 @@ services:
volumes: *ubuntu-volumes
command: *c-glib-command
+ ubuntu-cpp-skyhook:
+ # Usage:
+ # docker-compose build ubuntu-cpp-skyhook
+ # docker-compose run --rm ubuntu-cpp-skyhook
+ # Parameters:
+ # ARCH: amd64, arm64v8, ...
+ # UBUNTU: 16.04, 18.04, 20.04
Review comment:
```suggestion
# UBUNTU: 18.04, 20.04
```
##########
File path: cpp/src/skyhook/cls/cls_skyhook.cc
##########
@@ -0,0 +1,273 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include <rados/objclass.h>
+
+#include <memory>
+
+#include "arrow/compute/exec/expression.h"
+#include "arrow/dataset/dataset.h"
+#include "arrow/dataset/file_ipc.h"
+#include "arrow/dataset/file_parquet.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/result.h"
+#include "arrow/util/compression.h"
+#include "skyhook/protocol/skyhook_protocol.h"
+
+CLS_VER(1, 0)
+CLS_NAME(skyhook)
+
+cls_handle_t h_class;
+cls_method_handle_t h_scan_op;
+
+/// \brief Log skyhook errors using RADOS object class SDK's logger.
+void LogSkyhookError(const std::string& msg) { CLS_LOG(0, "error: %s",
msg.c_str()); }
+
+/// \class RandomAccessObject
+/// \brief An interface to provide a file-like view over RADOS objects.
+class RandomAccessObject : public arrow::io::RandomAccessFile {
+ public:
+ explicit RandomAccessObject(cls_method_context_t hctx, int64_t file_size) {
+ hctx_ = hctx;
+ content_length_ = file_size;
+ chunks_ = std::vector<ceph::bufferlist*>();
+ }
+
+ ~RandomAccessObject() {
+ arrow::Status close_st = Close();
+ if (!close_st.ok()) {
+ LogSkyhookError("Could not close RandomAccessObject: " +
close_st.ToString());
+ }
+ }
+
+ /// Check if the file stream is closed.
+ arrow::Status CheckClosed() const {
+ if (closed_) {
+ return arrow::Status::Invalid("Operation on closed stream");
+ }
+ return arrow::Status::OK();
+ }
+
+ /// Check if the position of the object is valid.
+ arrow::Status CheckPosition(int64_t position, const char* action) const {
+ if (position < 0) {
+ return arrow::Status::Invalid("Cannot ", action, " from negative
position");
+ }
+ if (position > content_length_) {
+ return arrow::Status::IOError("Cannot ", action, " past end of file");
+ }
+ return arrow::Status::OK();
+ }
+
+ arrow::Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out) {
+ return arrow::Status::NotImplemented(
+ "ReadAt has not been implemented in RandomAccessObject");
+ }
+
+ /// Read a specified number of bytes from a specified position.
+ arrow::Result<std::shared_ptr<arrow::Buffer>> ReadAt(int64_t position,
int64_t nbytes) {
+ RETURN_NOT_OK(CheckClosed());
+ RETURN_NOT_OK(CheckPosition(position, "read"));
+
+ // No need to allocate more than the remaining number of bytes
+ nbytes = std::min(nbytes, content_length_ - position);
+
+ if (nbytes > 0) {
+ ceph::bufferlist* bl = new ceph::bufferlist();
+ cls_cxx_read(hctx_, position, nbytes, bl);
+ chunks_.push_back(bl);
+ return std::make_shared<arrow::Buffer>((uint8_t*)bl->c_str(),
bl->length());
+ }
+ return std::make_shared<arrow::Buffer>("");
+ }
+
+ /// Read a specified number of bytes from the current position.
+ arrow::Result<std::shared_ptr<arrow::Buffer>> Read(int64_t nbytes) {
+ ARROW_ASSIGN_OR_RAISE(auto buffer, ReadAt(pos_, nbytes));
+ pos_ += buffer->size();
+ return std::move(buffer);
+ }
+
+ /// Read a specified number of bytes from the current position into an
output stream.
+ arrow::Result<int64_t> Read(int64_t nbytes, void* out) {
+ ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, ReadAt(pos_, nbytes, out));
+ pos_ += bytes_read;
+ return bytes_read;
+ }
+
+ /// Return the size of the file.
+ arrow::Result<int64_t> GetSize() {
+ RETURN_NOT_OK(CheckClosed());
+ return content_length_;
+ }
+
+ /// Sets the file-pointer offset, measured from the beginning of the
+ /// file, at which the next read or write occurs.
+ arrow::Status Seek(int64_t position) {
+ RETURN_NOT_OK(CheckClosed());
+ RETURN_NOT_OK(CheckPosition(position, "seek"));
+
+ pos_ = position;
+ return arrow::Status::OK();
+ }
+
+ /// Returns the file-pointer offset.
+ arrow::Result<int64_t> Tell() const {
+ RETURN_NOT_OK(CheckClosed());
+ return pos_;
+ }
+
+ /// Closes the file stream and deletes the chunks and releases the memory
+ /// used by the chunks.
+ arrow::Status Close() {
+ closed_ = true;
+ for (auto chunk : chunks_) {
+ delete chunk;
+ }
+ return arrow::Status::OK();
+ }
+
+ bool closed() const { return closed_; }
+
+ private:
+ cls_method_context_t hctx_;
+ bool closed_ = false;
+ int64_t pos_ = 0;
+ int64_t content_length_ = -1;
+ std::vector<ceph::bufferlist*> chunks_;
+};
+
+/// \brief Driver function to execute the Scan operations.
Review comment:
```suggestion
/// \brief Driver function to execute the Scan operations.
```
##########
File path: ci/scripts/install_ceph.sh
##########
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -ex
+
+apt update
+apt install -y ceph-common ceph-osd ceph-mon ceph-mgr ceph-mds ceph-fuse attr
Review comment:
Could you sort this package list in alphabetical order?
##########
File path: cpp/src/skyhook/CMakeLists.txt
##########
@@ -0,0 +1,97 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitationsn
+# under the License.
+
+#
+# arrow_skyhook
+#
+# define project properties
+project(arrow_skyhook)
+
+# add the client subdirectory
+add_subdirectory(client)
+
+# define the targets to build
+add_custom_target(arrow_skyhook_client)
+add_custom_target(cls_skyhook)
+
+# define the dependencies
+find_package(librados REQUIRED)
+include_directories(${LIBRADOS_INCLUDE_DIR})
+set(ARROW_SKYHOOK_LINK_STATIC arrow_dataset_static)
+set(ARROW_SKYHOOK_LINK_SHARED arrow_dataset_shared)
+set(ARROW_SKYHOOK_LINK_STATIC ${ARROW_SKYHOOK_LINK_STATIC} ${LIBRADOS_LIBRARY})
+set(ARROW_SKYHOOK_LINK_SHARED ${ARROW_SKYHOOK_LINK_SHARED} ${LIBRADOS_LIBRARY})
+
+# define the client and cls sources
+set(ARROW_SKYHOOK_CLIENT_SOURCES client/file_skyhook.cc
protocol/rados_protocol.cc
+ protocol/skyhook_protocol.cc)
+set(ARROW_SKYHOOK_CLS_SOURCES cls/cls_skyhook.cc protocol/rados_protocol.cc
+ protocol/skyhook_protocol.cc)
+
+# define the client library
+add_arrow_lib(arrow_skyhook_client
+ SOURCES
+ ${ARROW_SKYHOOK_CLIENT_SOURCES}
+ OUTPUTS
+ ARROW_SKYHOOK_CLIENT_LIBRARIES
+ SHARED_LINK_LIBS
+ ${ARROW_SKYHOOK_LINK_SHARED}
+ STATIC_LINK_LIBS
+ ${ARROW_SKYHOOK_LINK_STATIC})
Review comment:
Could you add `PKG_CONFIG_NAME skyhook` and add
`cpp/src/skyhook/skyhook.pc.in`?
See also:
*
https://github.com/apache/arrow/blob/master/cpp/src/gandiva/CMakeLists.txt#L125-L126
* https://github.com/apache/arrow/blob/master/cpp/src/gandiva/gandiva.pc.in
##########
File path: cpp/src/skyhook/CMakeLists.txt
##########
@@ -0,0 +1,97 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitationsn
+# under the License.
+
+#
+# arrow_skyhook
+#
+# define project properties
+project(arrow_skyhook)
+
+# add the client subdirectory
+add_subdirectory(client)
+
+# define the targets to build
+add_custom_target(arrow_skyhook_client)
+add_custom_target(cls_skyhook)
+
+# define the dependencies
+find_package(librados REQUIRED)
+include_directories(${LIBRADOS_INCLUDE_DIR})
+set(ARROW_SKYHOOK_LINK_STATIC arrow_dataset_static)
+set(ARROW_SKYHOOK_LINK_SHARED arrow_dataset_shared)
+set(ARROW_SKYHOOK_LINK_STATIC ${ARROW_SKYHOOK_LINK_STATIC} ${LIBRADOS_LIBRARY})
+set(ARROW_SKYHOOK_LINK_SHARED ${ARROW_SKYHOOK_LINK_SHARED} ${LIBRADOS_LIBRARY})
+
+# define the client and cls sources
+set(ARROW_SKYHOOK_CLIENT_SOURCES client/file_skyhook.cc
protocol/rados_protocol.cc
+ protocol/skyhook_protocol.cc)
+set(ARROW_SKYHOOK_CLS_SOURCES cls/cls_skyhook.cc protocol/rados_protocol.cc
+ protocol/skyhook_protocol.cc)
+
+# define the client library
+add_arrow_lib(arrow_skyhook_client
+ SOURCES
+ ${ARROW_SKYHOOK_CLIENT_SOURCES}
+ OUTPUTS
+ ARROW_SKYHOOK_CLIENT_LIBRARIES
+ SHARED_LINK_LIBS
+ ${ARROW_SKYHOOK_LINK_SHARED}
+ STATIC_LINK_LIBS
+ ${ARROW_SKYHOOK_LINK_STATIC})
+
+ # define the cls library
+add_arrow_lib(cls_skyhook
+ SOURCES
+ ${ARROW_SKYHOOK_CLS_SOURCES}
+ OUTPUTS
+ ARROW_SKYHOOK_CLS_LIBRARIES
+ SHARED_LINK_LIBS
+ ${ARROW_SKYHOOK_LINK_SHARED}
+ STATIC_LINK_LIBS
+ ${ARROW_SKYHOOK_LINK_STATIC})
+
+# finish building the project
+add_dependencies(arrow_skyhook_client ${ARROW_SKYHOOK_CLIENT_LIBRARIES})
+add_dependencies(cls_skyhook ${ARROW_SKYHOOK_CLS_LIBRARIES})
+
+# define the test builds
+if(ARROW_TEST_LINKAGE STREQUAL "static")
+ set(ARROW_SKYHOOK_TEST_LINK_LIBS arrow_dataset_static
${ARROW_TEST_STATIC_LINK_LIBS})
+ set(ARROW_SKYHOOK_TEST_LINK_LIBS ${ARROW_SKYHOOK_TEST_LINK_LIBS}
+ ${ARROW_SKYHOOK_CLIENT_LIBRARIES})
+else()
+ set(ARROW_SKYHOOK_TEST_LINK_LIBS arrow_dataset_shared
${ARROW_TEST_SHARED_LINK_LIBS})
+ set(ARROW_SKYHOOK_TEST_LINK_LIBS ${ARROW_SKYHOOK_TEST_LINK_LIBS}
+ ${ARROW_SKYHOOK_CLIENT_LIBRARIES})
+endif()
Review comment:
```suggestion
if(ARROW_TEST_LINKAGE STREQUAL "static")
set(ARROW_SKYHOOK_TEST_LINK_LIBS arrow_dataset_static
${ARROW_TEST_STATIC_LINK_LIBS})
else()
set(ARROW_SKYHOOK_TEST_LINK_LIBS arrow_dataset_shared
${ARROW_TEST_SHARED_LINK_LIBS})
endif()
list(APPEND ARROW_SKYHOOK_TEST_LINK_LIBS ${ARROW_SKYHOOK_CLIENT_LIBRARIES})
```
##########
File path: cpp/cmake_modules/DefineOptions.cmake
##########
@@ -224,6 +224,8 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL
"${CMAKE_CURRENT_SOURCE_DIR}")
define_option(ARROW_GANDIVA "Build the Gandiva libraries" OFF)
+ define_option(ARROW_SKYHOOK "Build the Skyhook libraries" OFF)
Review comment:
Could you keep this list in alphabetical order?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]