[jira] [Commented] (ARROW-1920) Add support for reading ORC files

ASF GitHub Bot (JIRA) Thu, 04 Jan 2018 07:12:17 -0800

    [ 
https://issues.apache.org/jira/browse/ARROW-1920?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16311456#comment-16311456
 ]


ASF GitHub Bot commented on ARROW-1920:
---------------------------------------

wesm closed pull request #1418: ARROW-1920 [C++/Python] Add experimental reader 
for Apache ORC files
URL: https://github.com/apache/arrow/pull/1418
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh
index 5f7b0a9a1..444386fdf 100755
--- a/ci/travis_script_python.sh
+++ b/ci/travis_script_python.sh
@@ -82,7 +82,7 @@ fi
 export PYARROW_BUILD_TYPE=$ARROW_BUILD_TYPE
 
 pip install -r requirements.txt
-python setup.py build_ext --with-parquet --with-plasma \
+python setup.py build_ext --with-parquet --with-plasma --with-orc\
        install --single-version-externally-managed --record=record.text
 popd
 
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index dd159ced2..ede13af5d 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -119,6 +119,10 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL 
"${CMAKE_CURRENT_SOURCE_DIR}")
     "Build the Arrow GPU extensions (requires CUDA installation)"
     OFF)
 
+  option(ARROW_ORC
+    "Build the Arrow ORC adapter"
+    ON)
+
   option(ARROW_JEMALLOC
     "Build the Arrow jemalloc-based allocator"
     OFF)
@@ -226,6 +230,17 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS)
   set(ARROW_WITH_ZSTD ON)
 endif()
 
+if (MSVC)
+  # ORC doesn't build on windows
+  set(ARROW_ORC OFF)
+endif()
+
+if(ARROW_ORC)
+  set(ARROW_WITH_LZ4 ON)
+  set(ARROW_WITH_SNAPPY ON)
+  set(ARROW_WITH_ZLIB ON)
+endif()
+
 if(NOT ARROW_BUILD_TESTS)
   set(NO_TESTS 1)
 endif()
@@ -526,6 +541,13 @@ if (ARROW_WITH_GRPC)
     ${ARROW_STATIC_LINK_LIBS})
 endif()
 
+if (ARROW_ORC)
+  SET(ARROW_STATIC_LINK_LIBS
+    orc
+    protobuf
+    ${ARROW_STATIC_LINK_LIBS})
+endif()
+
 if (ARROW_STATIC_LINK_LIBS)
   add_dependencies(arrow_dependencies ${ARROW_STATIC_LINK_LIBS})
 endif()
diff --git a/cpp/cmake_modules/FindProtobuf.cmake 
b/cpp/cmake_modules/FindProtobuf.cmake
new file mode 100644
index 000000000..a42f4493a
--- /dev/null
+++ b/cpp/cmake_modules/FindProtobuf.cmake
@@ -0,0 +1,89 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# PROTOBUF_HOME is used to locate the Protobuf headers and static libraries
+
+# PROTOBUF_INCLUDE_DIR: directory containing headers
+# PROTOBUF_LIBS: directory containing Protobuf libraries
+# PROTOBUF_STATIC_LIB: location of protobuf.a
+# PROTOC_STATIC_LIB: location of protoc.a
+# PROTOBUF_EXECUTABLE: location of protoc
+# PROTOBUF_FOUND is set if Protobuf is found
+
+
+if( NOT "${PROTOBUF_HOME}" STREQUAL "")
+    file (TO_CMAKE_PATH "${PROTOBUF_HOME}" _protobuf_path)
+endif()
+
+message (STATUS "PROTOBUF_HOME: ${PROTOBUF_HOME}")
+
+find_path (PROTOBUF_INCLUDE_DIR google/protobuf/io/zero_copy_stream.h HINTS
+  ${_protobuf_path}
+  NO_DEFAULT_PATH
+  PATH_SUFFIXES "include")
+
+find_path (PROTOBUF_INCLUDE_DIR google/protobuf/io/coded_stream.h HINTS
+  ${_protobuf_path}
+  NO_DEFAULT_PATH
+  PATH_SUFFIXES "include")
+
+find_library (PROTOBUF_LIBRARY NAMES protobuf PATHS
+  ${_protobuf_path}
+  NO_DEFAULT_PATH
+  PATH_SUFFIXES "lib")
+
+find_library (PROTOC_LIBRARY NAMES protoc PATHS
+  ${_protobuf_path}
+  NO_DEFAULT_PATH
+  PATH_SUFFIXES "lib")
+
+find_program(PROTOBUF_EXECUTABLE protoc HINTS
+  ${_protobuf_path}
+  NO_DEFAULT_PATH
+  PATH_SUFFIXES "bin")
+
+if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOC_LIBRARY AND 
PROTOBUF_EXECUTABLE)
+  set (PROTOBUF_FOUND TRUE)
+  get_filename_component (PROTOBUF_LIBS ${PROTOBUF_LIBRARY} PATH)
+  set (PROTOBUF_LIB_NAME protobuf)
+  set (PROTOC_LIB_NAME protoc)
+  set (PROTOBUF_STATIC_LIB 
${PROTOBUF_LIBS}/${CMAKE_STATIC_LIBRARY_PREFIX}${PROTOBUF_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX})
+  set (PROTOC_STATIC_LIB 
${PROTOBUF_LIBS}/${CMAKE_STATIC_LIBRARY_PREFIX}${PROTOC_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX})
+else ()
+  set (PROTOBUF_FOUND FALSE)
+endif ()
+
+if (PROTOBUF_FOUND)
+  message (STATUS "Found the Protobuf headers: ${PROTOBUF_INCLUDE_DIR}")
+  message (STATUS "Found the Protobuf library: ${PROTOBUF_STATIC_LIB}")
+  message (STATUS "Found the Protoc library: ${PROTOC_STATIC_LIB}")
+  message (STATUS "Found the Protoc executable: ${PROTOBUF_EXECUTABLE}")
+else()
+  if (_protobuf_path)
+    set (PROTOBUF_ERR_MSG "Could not find Protobuf. Looked in 
${_protobuf_path}.")
+  else ()
+    set (PROTOBUF_ERR_MSG "Could not find Protobuf in system search paths.")
+  endif()
+
+  if (Protobuf_FIND_REQUIRED)
+    message (FATAL_ERROR "${PROTOBUF_ERR_MSG}")
+  else ()
+    message (STATUS "${PROTOBUF_ERR_MSG}")
+  endif ()
+endif()
+
+mark_as_advanced (
+  PROTOBUF_INCLUDE_DIR
+  PROTOBUF_LIBS
+  PROTOBUF_STATIC_LIB
+  PROTOC_STATIC_LIB
+)
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake 
b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index b706aab45..4f6443417 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -29,7 +29,9 @@ set(SNAPPY_VERSION "1.1.3")
 set(BROTLI_VERSION "v0.6.0")
 set(LZ4_VERSION "1.7.5")
 set(ZSTD_VERSION "1.2.0")
+set(PROTOBUF_VERSION "2.6.0")
 set(GRPC_VERSION "94582910ad7f82ad447ecc72e6548cb669e4f7a9") # v1.6.5
+set(ORC_VERSION "cf00b67795717ab3eb04e950780ed6d104109017")
 
 string(TOUPPER ${CMAKE_BUILD_TYPE} UPPERCASE_BUILD_TYPE)
 
@@ -721,6 +723,7 @@ if (ARROW_WITH_LZ4)
 
   if("${LZ4_HOME}" STREQUAL "")
     set(LZ4_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/lz4_ep-prefix/src/lz4_ep")
+    set(LZ4_HOME "${LZ4_BUILD_DIR}")
     set(LZ4_INCLUDE_DIR "${LZ4_BUILD_DIR}/lib")
 
     if (MSVC)
@@ -865,3 +868,73 @@ if (ARROW_WITH_GRPC)
   endif()
 
 endif()
+
+if (ARROW_ORC)
+  # protobuf
+  if ("${PROTOBUF_HOME}" STREQUAL "")
+    set (PROTOBUF_PREFIX "${THIRDPARTY_DIR}/protobuf_ep-install")
+    set (PROTOBUF_HOME "${PROTOBUF_PREFIX}")
+    set (PROTOBUF_INCLUDE_DIR "${PROTOBUF_PREFIX}/include")
+    set (PROTOBUF_STATIC_LIB 
"${PROTOBUF_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}protobuf${CMAKE_STATIC_LIBRARY_SUFFIX}")
+    set (PROTOBUF_SRC_URL 
"https://github.com/google/protobuf/releases/download/v${PROTOBUF_VERSION}/protobuf-${PROTOBUF_VERSION}.tar.gz")
+
+    ExternalProject_Add(protobuf_ep
+      CONFIGURE_COMMAND "./configure" "--disable-shared" 
"--prefix=${PROTOBUF_PREFIX}" "CXXFLAGS=${EP_CXX_FLAGS}"
+      BUILD_IN_SOURCE 1
+      URL ${PROTOBUF_SRC_URL}
+      LOG_DOWNLOAD 1
+      LOG_CONFIGURE 1
+      LOG_BUILD 1
+      LOG_INSTALL 1
+      BUILD_BYPRODUCTS "${PROTOBUF_STATIC_LIB}")
+
+    set (PROTOBUF_VENDORED 1)
+  else ()
+    find_package (Protobuf REQUIRED)
+    set (PROTOBUF_VENDORED 0)
+  endif ()
+
+  include_directories (SYSTEM ${PROTOBUF_INCLUDE_DIR})
+  ADD_THIRDPARTY_LIB(protobuf
+    STATIC_LIB ${PROTOBUF_STATIC_LIB})
+
+  if (PROTOBUF_VENDORED)
+    add_dependencies (protobuf protobuf_ep)
+  endif ()
+
+  # orc
+  set(ORC_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/orc_ep-install")
+  set(ORC_HOME "${ORC_PREFIX}")
+  set(ORC_INCLUDE_DIR "${ORC_PREFIX}/include")
+  set(ORC_STATIC_LIB 
"${ORC_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}orc${CMAKE_STATIC_LIBRARY_SUFFIX}")
+
+  # Since LZ4 isn't installed, the header file is in ${LZ4_HOME}/lib instead of
+  # ${LZ4_HOME}/include, which forces us to specify the include directory
+  # manually as well.
+  set (ORC_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+                      -DCMAKE_INSTALL_PREFIX=${ORC_PREFIX}
+                      -DCMAKE_CXX_FLAGS=${EP_CXX_FLAGS}
+                      -DBUILD_LIBHDFSPP=OFF
+                      -DBUILD_JAVA=OFF
+                      -DBUILD_TOOLS=OFF
+                      -DBUILD_CPP_TESTS=OFF
+                      -DINSTALL_VENDORED_LIBS=OFF
+                      -DPROTOBUF_HOME=${PROTOBUF_HOME}
+                      -DLZ4_HOME=${LZ4_HOME}
+                      -DLZ4_INCLUDE_DIR=${LZ4_INCLUDE_DIR}
+                      -DSNAPPY_HOME=${SNAPPY_HOME}
+                      -DZLIB_HOME=${ZLIB_HOME})
+
+  ExternalProject_Add(orc_ep
+    GIT_REPOSITORY "https://github.com/apache/orc"
+    GIT_TAG ${ORC_VERSION}
+    BUILD_BYPRODUCTS ${ORC_STATIC_LIB}
+    CMAKE_ARGS ${ORC_CMAKE_ARGS})
+
+  include_directories(SYSTEM ${ORC_INCLUDE_DIR})
+  ADD_THIRDPARTY_LIB(orc
+    STATIC_LIB ${ORC_STATIC_LIB})
+
+  add_dependencies(orc_ep protobuf lz4_static snappy zlib)
+  add_dependencies(orc orc_ep)
+endif()
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index d645cca22..ad86256e0 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -90,6 +90,11 @@ if (ARROW_WITH_ZSTD)
   SET(ARROW_SRCS util/compression_zstd.cc ${ARROW_SRCS})
 endif()
 
+if (ARROW_ORC)
+  add_subdirectory(adapters/orc)
+  SET(ARROW_SRCS adapters/orc/adapter.cc ${ARROW_SRCS})
+endif()
+
 if (NOT ARROW_BOOST_HEADER_ONLY)
   set(ARROW_SRCS ${ARROW_SRCS}
     io/hdfs.cc
diff --git a/cpp/src/arrow/adapters/orc/CMakeLists.txt 
b/cpp/src/arrow/adapters/orc/CMakeLists.txt
new file mode 100644
index 000000000..eb7194cd4
--- /dev/null
+++ b/cpp/src/arrow/adapters/orc/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#######################################
+# arrow_orc
+#######################################
+
+# Headers: top level
+install(FILES
+        adapter.h
+        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/adapters/orc")
diff --git a/cpp/src/arrow/adapters/orc/adapter.cc 
b/cpp/src/arrow/adapters/orc/adapter.cc
new file mode 100644
index 000000000..473c90f92
--- /dev/null
+++ b/cpp/src/arrow/adapters/orc/adapter.cc
@@ -0,0 +1,697 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/adapters/orc/adapter.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <list>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/builder.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/memory_pool.h"
+#include "arrow/record_batch.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/table_builder.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+#include "orc/OrcFile.hh"
+
+// alias to not interfere with nested orc namespace
+namespace liborc = orc;
+
+namespace arrow {
+namespace adapters {
+namespace orc {
+
+#define ORC_THROW_NOT_OK(s)                   \
+  do {                                        \
+    Status _s = (s);                          \
+    if (!_s.ok()) {                           \
+      std::stringstream ss;                   \
+      ss << "Arrow error: " << _s.ToString(); \
+      throw liborc::ParseError(ss.str());     \
+    }                                         \
+  } while (0)
+
+class ArrowInputFile : public liborc::InputStream {
+ public:
+  explicit ArrowInputFile(const std::shared_ptr<io::ReadableFileInterface>& 
file)
+      : file_(file) {}
+
+  uint64_t getLength() const override {
+    int64_t size;
+    ORC_THROW_NOT_OK(file_->GetSize(&size));
+    return static_cast<uint64_t>(size);
+  }
+
+  uint64_t getNaturalReadSize() const override { return 128 * 1024; }
+
+  void read(void* buf, uint64_t length, uint64_t offset) override {
+    int64_t bytes_read;
+
+    ORC_THROW_NOT_OK(file_->ReadAt(offset, length, &bytes_read, buf));
+
+    if (static_cast<uint64_t>(bytes_read) != length) {
+      throw liborc::ParseError("Short read from arrow input file");
+    }
+  }
+
+  const std::string& getName() const override {
+    static const std::string filename("ArrowInputFile");
+    return filename;
+  }
+
+ private:
+  std::shared_ptr<io::ReadableFileInterface> file_;
+};
+
+struct StripeInformation {
+  uint64_t offset;
+  uint64_t length;
+  uint64_t num_rows;
+};
+
+Status GetArrowType(const liborc::Type* type, std::shared_ptr<DataType>* out) {
+  // When subselecting fields on read, liborc will set some nodes to nullptr,
+  // so we need to check for nullptr before progressing
+  if (type == nullptr) {
+    *out = null();
+    return Status::OK();
+  }
+  liborc::TypeKind kind = type->getKind();
+  switch (kind) {
+    case liborc::BOOLEAN:
+      *out = boolean();
+      break;
+    case liborc::BYTE:
+      *out = int8();
+      break;
+    case liborc::SHORT:
+      *out = int16();
+      break;
+    case liborc::INT:
+      *out = int32();
+      break;
+    case liborc::LONG:
+      *out = int64();
+      break;
+    case liborc::FLOAT:
+      *out = float32();
+      break;
+    case liborc::DOUBLE:
+      *out = float64();
+      break;
+    case liborc::VARCHAR:
+    case liborc::STRING:
+      *out = utf8();
+      break;
+    case liborc::BINARY:
+      *out = binary();
+      break;
+    case liborc::CHAR:
+      *out = fixed_size_binary(type->getMaximumLength());
+      break;
+    case liborc::TIMESTAMP:
+      *out = timestamp(TimeUnit::NANO);
+      break;
+    case liborc::DATE:
+      *out = date32();
+      break;
+    case liborc::DECIMAL: {
+      if (type->getPrecision() == 0) {
+        // In HIVE 0.11/0.12 precision is set as 0, but means max precision
+        *out = decimal(38, 6);
+      } else {
+        *out = decimal(type->getPrecision(), type->getScale());
+      }
+      break;
+    }
+    case liborc::LIST: {
+      if (type->getSubtypeCount() != 1) {
+        return Status::Invalid("Invalid Orc List type");
+      }
+      std::shared_ptr<DataType> elemtype;
+      RETURN_NOT_OK(GetArrowType(type->getSubtype(0), &elemtype));
+      *out = list(elemtype);
+      break;
+    }
+    case liborc::MAP: {
+      if (type->getSubtypeCount() != 2) {
+        return Status::Invalid("Invalid Orc Map type");
+      }
+      std::shared_ptr<DataType> keytype;
+      std::shared_ptr<DataType> valtype;
+      RETURN_NOT_OK(GetArrowType(type->getSubtype(0), &keytype));
+      RETURN_NOT_OK(GetArrowType(type->getSubtype(1), &valtype));
+      *out = list(struct_({field("key", keytype), field("value", valtype)}));
+      break;
+    }
+    case liborc::STRUCT: {
+      int size = type->getSubtypeCount();
+      std::vector<std::shared_ptr<Field>> fields;
+      for (int child = 0; child < size; ++child) {
+        std::shared_ptr<DataType> elemtype;
+        RETURN_NOT_OK(GetArrowType(type->getSubtype(child), &elemtype));
+        std::string name = type->getFieldName(child);
+        fields.push_back(field(name, elemtype));
+      }
+      *out = struct_(fields);
+      break;
+    }
+    case liborc::UNION: {
+      int size = type->getSubtypeCount();
+      std::vector<std::shared_ptr<Field>> fields;
+      std::vector<uint8_t> type_codes;
+      for (int child = 0; child < size; ++child) {
+        std::shared_ptr<DataType> elemtype;
+        RETURN_NOT_OK(GetArrowType(type->getSubtype(child), &elemtype));
+        fields.push_back(field("_union_" + std::to_string(child), elemtype));
+        type_codes.push_back(static_cast<uint8_t>(child));
+      }
+      *out = union_(fields, type_codes);
+      break;
+    }
+    default: {
+      std::stringstream ss;
+      ss << "Unknown Orc type kind: " << kind;
+      return Status::Invalid(ss.str());
+    }
+  }
+  return Status::OK();
+}
+
+// The number of rows to read in a ColumnVectorBatch
+constexpr int64_t kReadRowsBatch = 1000;
+
+// The number of nanoseconds in a second
+constexpr int64_t kOneSecondNanos = 1000000000LL;
+
+class ORCFileReader::Impl {
+ public:
+  Impl() {}
+  ~Impl() {}
+
+  Status Open(const std::shared_ptr<io::ReadableFileInterface>& file, 
MemoryPool* pool) {
+    std::unique_ptr<ArrowInputFile> io_wrapper(new ArrowInputFile(file));
+    liborc::ReaderOptions options;
+    std::unique_ptr<liborc::Reader> liborc_reader;
+    try {
+      liborc_reader = createReader(std::move(io_wrapper), options);
+    } catch (const liborc::ParseError& e) {
+      return Status::IOError(e.what());
+    }
+    pool_ = pool;
+    reader_ = std::move(liborc_reader);
+
+    return Init();
+  }
+
+  Status Init() {
+    int64_t nstripes = reader_->getNumberOfStripes();
+    stripes_.resize(nstripes);
+    std::unique_ptr<liborc::StripeInformation> stripe;
+    for (int i = 0; i < nstripes; ++i) {
+      stripe = reader_->getStripe(i);
+      stripes_[i] = StripeInformation(
+          {stripe->getOffset(), stripe->getLength(), 
stripe->getNumberOfRows()});
+    }
+    return Status::OK();
+  }
+
+  int64_t NumberOfStripes() { return stripes_.size(); }
+
+  int64_t NumberOfRows() { return reader_->getNumberOfRows(); }
+
+  Status ReadSchema(std::shared_ptr<Schema>* out) {
+    const liborc::Type& type = reader_->getType();
+    return GetArrowSchema(type, out);
+  }
+
+  Status GetArrowSchema(const liborc::Type& type, std::shared_ptr<Schema>* 
out) {
+    if (type.getKind() != liborc::STRUCT) {
+      return Status::NotImplemented(
+          "Only ORC files with a top-level struct "
+          "can be handled");
+    }
+    int size = type.getSubtypeCount();
+    std::vector<std::shared_ptr<Field>> fields;
+    for (int child = 0; child < size; ++child) {
+      std::shared_ptr<DataType> elemtype;
+      RETURN_NOT_OK(GetArrowType(type.getSubtype(child), &elemtype));
+      std::string name = type.getFieldName(child);
+      fields.push_back(field(name, elemtype));
+    }
+    std::list<std::string> keys = reader_->getMetadataKeys();
+    std::shared_ptr<KeyValueMetadata> metadata;
+    if (!keys.empty()) {
+      metadata = std::make_shared<KeyValueMetadata>();
+      for (auto it = keys.begin(); it != keys.end(); ++it) {
+        metadata->Append(*it, reader_->getMetadataValue(*it));
+      }
+    }
+
+    *out = std::make_shared<Schema>(fields, metadata);
+    return Status::OK();
+  }
+
+  Status Read(std::shared_ptr<Table>* out) {
+    liborc::RowReaderOptions opts;
+    return ReadTable(opts, out);
+  }
+
+  Status Read(const std::vector<int>& include_indices, std::shared_ptr<Table>* 
out) {
+    liborc::RowReaderOptions opts;
+    RETURN_NOT_OK(SelectIndices(&opts, include_indices));
+    return ReadTable(opts, out);
+  }
+
+  Status ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out) {
+    liborc::RowReaderOptions opts;
+    RETURN_NOT_OK(SelectStripe(&opts, stripe));
+    return ReadBatch(opts, stripes_[stripe].num_rows, out);
+  }
+
+  Status ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
+                    std::shared_ptr<RecordBatch>* out) {
+    liborc::RowReaderOptions opts;
+    RETURN_NOT_OK(SelectIndices(&opts, include_indices));
+    RETURN_NOT_OK(SelectStripe(&opts, stripe));
+    return ReadBatch(opts, stripes_[stripe].num_rows, out);
+  }
+
+  Status SelectStripe(liborc::RowReaderOptions* opts, int64_t stripe) {
+    if (stripe < 0 || stripe >= NumberOfStripes()) {
+      std::stringstream ss;
+      ss << "Out of bounds stripe: " << stripe;
+      return Status::Invalid(ss.str());
+    }
+    opts->range(stripes_[stripe].offset, stripes_[stripe].length);
+    return Status::OK();
+  }
+
+  Status SelectIndices(liborc::RowReaderOptions* opts,
+                       const std::vector<int>& include_indices) {
+    std::list<uint64_t> include_indices_list;
+    for (auto it = include_indices.begin(); it != include_indices.end(); ++it) 
{
+      if (*it < 0) {
+        return Status::Invalid("Negative field index");
+      }
+      include_indices_list.push_back(*it);
+    }
+    opts->includeTypes(include_indices_list);
+    return Status::OK();
+  }
+
+  Status ReadTable(const liborc::RowReaderOptions& row_opts,
+                   std::shared_ptr<Table>* out) {
+    liborc::RowReaderOptions opts(row_opts);
+    std::vector<std::shared_ptr<RecordBatch>> batches(stripes_.size());
+    for (size_t stripe = 0; stripe < stripes_.size(); stripe++) {
+      opts.range(stripes_[stripe].offset, stripes_[stripe].length);
+      RETURN_NOT_OK(ReadBatch(opts, stripes_[stripe].num_rows, 
&batches[stripe]));
+    }
+    return Table::FromRecordBatches(batches, out);
+  }
+
+  Status ReadBatch(const liborc::RowReaderOptions& opts, int64_t nrows,
+                   std::shared_ptr<RecordBatch>* out) {
+    std::unique_ptr<liborc::RowReader> rowreader;
+    std::unique_ptr<liborc::ColumnVectorBatch> batch;
+    try {
+      rowreader = reader_->createRowReader(opts);
+      batch = rowreader->createRowBatch(std::min(nrows, kReadRowsBatch));
+    } catch (const liborc::ParseError& e) {
+      return Status::Invalid(e.what());
+    }
+    const liborc::Type& type = rowreader->getSelectedType();
+    std::shared_ptr<Schema> schema;
+    RETURN_NOT_OK(GetArrowSchema(type, &schema));
+
+    std::unique_ptr<RecordBatchBuilder> builder;
+    RETURN_NOT_OK(RecordBatchBuilder::Make(schema, pool_, nrows, &builder));
+
+    // The top-level type must be a struct to read into an arrow table
+    const auto& struct_batch = static_cast<liborc::StructVectorBatch&>(*batch);
+
+    while (rowreader->next(*batch)) {
+      for (int i = 0; i < builder->num_fields(); i++) {
+        RETURN_NOT_OK(AppendBatch(type.getSubtype(i), struct_batch.fields[i], 
0,
+                                  batch->numElements, builder->GetField(i)));
+      }
+    }
+    RETURN_NOT_OK(builder->Flush(out));
+    return Status::OK();
+  }
+
+  Status AppendBatch(const liborc::Type* type, liborc::ColumnVectorBatch* 
batch,
+                     int64_t offset, int64_t length, ArrayBuilder* builder) {
+    if (type == nullptr) {
+      return Status::OK();
+    }
+    liborc::TypeKind kind = type->getKind();
+    switch (kind) {
+      case liborc::STRUCT:
+        return AppendStructBatch(type, batch, offset, length, builder);
+      case liborc::LIST:
+        return AppendListBatch(type, batch, offset, length, builder);
+      case liborc::MAP:
+        return AppendMapBatch(type, batch, offset, length, builder);
+      case liborc::LONG:
+        return AppendNumericBatch<Int64Builder, liborc::LongVectorBatch, 
int64_t>(
+            batch, offset, length, builder);
+      case liborc::INT:
+        return AppendNumericBatchCast<Int32Builder, int32_t, 
liborc::LongVectorBatch,
+                                      int64_t>(batch, offset, length, builder);
+      case liborc::SHORT:
+        return AppendNumericBatchCast<Int16Builder, int16_t, 
liborc::LongVectorBatch,
+                                      int64_t>(batch, offset, length, builder);
+      case liborc::BYTE:
+        return AppendNumericBatchCast<Int8Builder, int8_t, 
liborc::LongVectorBatch,
+                                      int64_t>(batch, offset, length, builder);
+      case liborc::DOUBLE:
+        return AppendNumericBatch<DoubleBuilder, liborc::DoubleVectorBatch, 
double>(
+            batch, offset, length, builder);
+      case liborc::FLOAT:
+        return AppendNumericBatchCast<FloatBuilder, float, 
liborc::DoubleVectorBatch,
+                                      double>(batch, offset, length, builder);
+      case liborc::BOOLEAN:
+        return AppendBoolBatch(batch, offset, length, builder);
+      case liborc::VARCHAR:
+      case liborc::STRING:
+        return AppendBinaryBatch<StringBuilder>(batch, offset, length, 
builder);
+      case liborc::BINARY:
+        return AppendBinaryBatch<BinaryBuilder>(batch, offset, length, 
builder);
+      case liborc::CHAR:
+        return AppendFixedBinaryBatch(batch, offset, length, builder);
+      case liborc::DATE:
+        return AppendNumericBatchCast<Date32Builder, int32_t, 
liborc::LongVectorBatch,
+                                      int64_t>(batch, offset, length, builder);
+      case liborc::TIMESTAMP:
+        return AppendTimestampBatch(batch, offset, length, builder);
+      case liborc::DECIMAL:
+        return AppendDecimalBatch(type, batch, offset, length, builder);
+      default:
+        std::stringstream ss;
+        ss << "Not implemented type kind: " << kind;
+        return Status::NotImplemented(ss.str());
+    }
+  }
+
+  Status AppendStructBatch(const liborc::Type* type, 
liborc::ColumnVectorBatch* cbatch,
+                           int64_t offset, int64_t length, ArrayBuilder* 
abuilder) {
+    auto builder = static_cast<StructBuilder*>(abuilder);
+    auto batch = static_cast<liborc::StructVectorBatch*>(cbatch);
+
+    const uint8_t* valid_bytes = nullptr;
+    if (batch->hasNulls) {
+      valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + 
offset;
+    }
+    RETURN_NOT_OK(builder->Append(length, valid_bytes));
+
+    for (int i = 0; i < builder->num_fields(); i++) {
+      RETURN_NOT_OK(AppendBatch(type->getSubtype(i), batch->fields[i], offset, 
length,
+                                builder->field_builder(i)));
+    }
+    return Status::OK();
+  }
+
+  Status AppendListBatch(const liborc::Type* type, liborc::ColumnVectorBatch* 
cbatch,
+                         int64_t offset, int64_t length, ArrayBuilder* 
abuilder) {
+    auto builder = static_cast<ListBuilder*>(abuilder);
+    auto batch = static_cast<liborc::ListVectorBatch*>(cbatch);
+    liborc::ColumnVectorBatch* elements = batch->elements.get();
+    const liborc::Type* elemtype = type->getSubtype(0);
+
+    const bool has_nulls = batch->hasNulls;
+    for (int i = offset; i < length + offset; i++) {
+      if (!has_nulls || batch->notNull[i]) {
+        int64_t start = batch->offsets[i];
+        int64_t end = batch->offsets[i + 1];
+        RETURN_NOT_OK(builder->Append());
+        RETURN_NOT_OK(AppendBatch(elemtype, elements, start, end - start,
+                                  builder->value_builder()));
+      } else {
+        RETURN_NOT_OK(builder->AppendNull());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status AppendMapBatch(const liborc::Type* type, liborc::ColumnVectorBatch* 
cbatch,
+                        int64_t offset, int64_t length, ArrayBuilder* 
abuilder) {
+    auto list_builder = static_cast<ListBuilder*>(abuilder);
+    auto struct_builder = 
static_cast<StructBuilder*>(list_builder->value_builder());
+    auto batch = static_cast<liborc::MapVectorBatch*>(cbatch);
+    liborc::ColumnVectorBatch* keys = batch->keys.get();
+    liborc::ColumnVectorBatch* vals = batch->elements.get();
+    const liborc::Type* keytype = type->getSubtype(0);
+    const liborc::Type* valtype = type->getSubtype(1);
+
+    const bool has_nulls = batch->hasNulls;
+    for (int i = offset; i < length + offset; i++) {
+      RETURN_NOT_OK(list_builder->Append());
+      int64_t start = batch->offsets[i];
+      int64_t list_length = batch->offsets[i + 1] - start;
+      if (list_length && (!has_nulls || batch->notNull[i])) {
+        RETURN_NOT_OK(struct_builder->Append(list_length, nullptr));
+        RETURN_NOT_OK(AppendBatch(keytype, keys, start, list_length,
+                                  struct_builder->field_builder(0)));
+        RETURN_NOT_OK(AppendBatch(valtype, vals, start, list_length,
+                                  struct_builder->field_builder(1)));
+      }
+    }
+    return Status::OK();
+  }
+
+  template <class builder_type, class batch_type, class elem_type>
+  Status AppendNumericBatch(liborc::ColumnVectorBatch* cbatch, int64_t offset,
+                            int64_t length, ArrayBuilder* abuilder) {
+    auto builder = static_cast<builder_type*>(abuilder);
+    auto batch = static_cast<batch_type*>(cbatch);
+
+    if (length == 0) {
+      return Status::OK();
+    }
+    const uint8_t* valid_bytes = nullptr;
+    if (batch->hasNulls) {
+      valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + 
offset;
+    }
+    const elem_type* source = batch->data.data() + offset;
+    RETURN_NOT_OK(builder->Append(source, length, valid_bytes));
+    return Status::OK();
+  }
+
+  template <class builder_type, class target_type, class batch_type, class 
source_type>
+  Status AppendNumericBatchCast(liborc::ColumnVectorBatch* cbatch, int64_t 
offset,
+                                int64_t length, ArrayBuilder* abuilder) {
+    auto builder = static_cast<builder_type*>(abuilder);
+    auto batch = static_cast<batch_type*>(cbatch);
+
+    if (length == 0) {
+      return Status::OK();
+    }
+    int start = builder->length();
+
+    const uint8_t* valid_bytes = nullptr;
+    if (batch->hasNulls) {
+      valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + 
offset;
+    }
+    RETURN_NOT_OK(builder->AppendNulls(valid_bytes, length));
+
+    const source_type* source = batch->data.data() + offset;
+    target_type* target = 
reinterpret_cast<target_type*>(builder->data()->mutable_data());
+
+    std::copy(source, source + length, target + start);
+
+    return Status::OK();
+  }
+
+  // Append `length` boolean values, starting at `offset`, from an ORC batch that
+  // stores them as 64-bit integers (liborc::LongVectorBatch).
+  //
+  // Validity is appended through builder->AppendNulls(); each value is then
+  // written as one bit of the builder's data buffer: set when the source element
+  // is non-zero, cleared otherwise.
+  Status AppendBoolBatch(liborc::ColumnVectorBatch* cbatch, int64_t offset,
+                         int64_t length, ArrayBuilder* abuilder) {
+    auto builder = static_cast<BooleanBuilder*>(abuilder);
+    auto batch = static_cast<liborc::LongVectorBatch*>(cbatch);
+
+    if (length == 0) {
+      return Status::OK();
+    }
+    // 64-bit write position: `int` would truncate for very large builders.
+    const int64_t start = builder->length();
+
+    const uint8_t* valid_bytes = nullptr;
+    if (batch->hasNulls) {
+      valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
+    }
+    RETURN_NOT_OK(builder->AppendNulls(valid_bytes, length));
+
+    const int64_t* source = batch->data.data() + offset;
+    uint8_t* target = reinterpret_cast<uint8_t*>(builder->data()->mutable_data());
+
+    // The index has the same width as `length` so the comparison cannot wrap.
+    for (int64_t i = 0; i < length; i++) {
+      if (source[i]) {
+        BitUtil::SetBit(target, start + i);
+      } else {
+        BitUtil::ClearBit(target, start + i);
+      }
+    }
+    return Status::OK();
+  }
+
+  // Append `length` timestamps, starting at `offset`, from an ORC
+  // liborc::TimestampVectorBatch.
+  //
+  // ORC supplies each timestamp as a (seconds, nanoseconds) pair; the two are
+  // combined into a single int64 value as seconds * kOneSecondNanos + nanos and
+  // written into the builder's data buffer after the validity has been appended
+  // through builder->AppendNulls().
+  Status AppendTimestampBatch(liborc::ColumnVectorBatch* cbatch, int64_t offset,
+                              int64_t length, ArrayBuilder* abuilder) {
+    auto builder = static_cast<TimestampBuilder*>(abuilder);
+    auto batch = static_cast<liborc::TimestampVectorBatch*>(cbatch);
+
+    if (length == 0) {
+      return Status::OK();
+    }
+    // 64-bit write position: `int` would truncate for very large builders.
+    const int64_t start = builder->length();
+
+    const uint8_t* valid_bytes = nullptr;
+    if (batch->hasNulls) {
+      valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
+    }
+    RETURN_NOT_OK(builder->AppendNulls(valid_bytes, length));
+
+    const int64_t* seconds = batch->data.data() + offset;
+    const int64_t* nanos = batch->nanoseconds.data() + offset;
+    int64_t* target = reinterpret_cast<int64_t*>(builder->data()->mutable_data());
+
+    // The index has the same width as `length` so the comparison cannot wrap.
+    for (int64_t i = 0; i < length; i++) {
+      // TODO: boundscheck this, as ORC supports higher resolution timestamps
+      // than arrow for nanosecond resolution
+      target[start + i] = seconds[i] * kOneSecondNanos + nanos[i];
+    }
+    return Status::OK();
+  }
+
+  // Append rows [offset, offset + length) of an ORC liborc::StringVectorBatch to
+  // a variable-length builder (builder_type), one element at a time.
+  //
+  // A row is appended as a null when the batch reports nulls and its notNull
+  // entry is zero; otherwise its data pointer and length are appended.
+  template <class builder_type>
+  Status AppendBinaryBatch(liborc::ColumnVectorBatch* cbatch, int64_t offset,
+                           int64_t length, ArrayBuilder* abuilder) {
+    auto builder = static_cast<builder_type*>(abuilder);
+    auto batch = static_cast<liborc::StringVectorBatch*>(cbatch);
+
+    const bool has_nulls = batch->hasNulls;
+    // The index is int64_t like `offset`/`length`; an `int` would narrow them.
+    for (int64_t i = offset; i < length + offset; i++) {
+      if (!has_nulls || batch->notNull[i]) {
+        RETURN_NOT_OK(builder->Append(batch->data[i], batch->length[i]));
+      } else {
+        RETURN_NOT_OK(builder->AppendNull());
+      }
+    }
+    return Status::OK();
+  }
+
+  // Append rows [offset, offset + length) of an ORC liborc::StringVectorBatch to
+  // a FixedSizeBinaryBuilder, one element at a time.
+  //
+  // Only the data pointer of each non-null row is passed to the builder; rows
+  // flagged as null by the batch are appended as nulls.
+  Status AppendFixedBinaryBatch(liborc::ColumnVectorBatch* cbatch, int64_t offset,
+                                int64_t length, ArrayBuilder* abuilder) {
+    auto builder = static_cast<FixedSizeBinaryBuilder*>(abuilder);
+    auto batch = static_cast<liborc::StringVectorBatch*>(cbatch);
+
+    const bool has_nulls = batch->hasNulls;
+    // The index is int64_t like `offset`/`length`; an `int` would narrow them.
+    for (int64_t i = offset; i < length + offset; i++) {
+      if (!has_nulls || batch->notNull[i]) {
+        RETURN_NOT_OK(builder->Append(batch->data[i]));
+      } else {
+        RETURN_NOT_OK(builder->AppendNull());
+      }
+    }
+    return Status::OK();
+  }
+
+  // Append rows [offset, offset + length) of an ORC decimal batch to a
+  // Decimal128Builder.
+  //
+  // The concrete batch class is chosen from the ORC type's precision: a
+  // precision of 0 or above 18 is read as liborc::Decimal128VectorBatch (high
+  // and low bits combined into a Decimal128), anything else as
+  // liborc::Decimal64VectorBatch (each value widened to a Decimal128). Rows
+  // flagged as null by the batch are appended as nulls.
+  Status AppendDecimalBatch(const liborc::Type* type, liborc::ColumnVectorBatch* cbatch,
+                            int64_t offset, int64_t length, ArrayBuilder* abuilder) {
+    auto builder = static_cast<Decimal128Builder*>(abuilder);
+
+    const bool has_nulls = cbatch->hasNulls;
+    if (type->getPrecision() == 0 || type->getPrecision() > 18) {
+      auto batch = static_cast<liborc::Decimal128VectorBatch*>(cbatch);
+      // The index is int64_t like `offset`/`length`; an `int` would narrow them.
+      for (int64_t i = offset; i < length + offset; i++) {
+        if (!has_nulls || batch->notNull[i]) {
+          RETURN_NOT_OK(builder->Append(
+              Decimal128(batch->values[i].getHighBits(), batch->values[i].getLowBits())));
+        } else {
+          RETURN_NOT_OK(builder->AppendNull());
+        }
+      }
+    } else {
+      auto batch = static_cast<liborc::Decimal64VectorBatch*>(cbatch);
+      for (int64_t i = offset; i < length + offset; i++) {
+        if (!has_nulls || batch->notNull[i]) {
+          RETURN_NOT_OK(builder->Append(Decimal128(batch->values[i])));
+        } else {
+          RETURN_NOT_OK(builder->AppendNull());
+        }
+      }
+    }
+    return Status::OK();
+  }
+
+ private:
+  MemoryPool* pool_;
+  std::unique_ptr<liborc::Reader> reader_;
+  std::vector<StripeInformation> stripes_;
+};
+
+// Construct a reader with a fresh, not-yet-opened implementation object.
+ORCFileReader::ORCFileReader() { impl_.reset(new ORCFileReader::Impl()); }
+
+// Defined out of line, after the definition of Impl above.
+ORCFileReader::~ORCFileReader() {}
+
+// Create an ORCFileReader for `file`, using `pool` for the implementation's
+// Open() call. On success the new reader is moved into *reader; on failure the
+// error Status is returned and *reader is left untouched.
+Status ORCFileReader::Open(const std::shared_ptr<io::ReadableFileInterface>& file,
+                           MemoryPool* pool, std::unique_ptr<ORCFileReader>* reader) {
+  std::unique_ptr<ORCFileReader> opened(new ORCFileReader());
+  const Status status = opened->impl_->Open(file, pool);
+  if (!status.ok()) {
+    return status;
+  }
+  *reader = std::move(opened);
+  return Status::OK();
+}
+
+// The public methods below forward directly to the Impl object.
+
+// Forward to Impl::ReadSchema.
+Status ORCFileReader::ReadSchema(std::shared_ptr<Schema>* out) {
+  return impl_->ReadSchema(out);
+}
+
+// Forward to Impl::Read (no column selection).
+Status ORCFileReader::Read(std::shared_ptr<Table>* out) { return 
impl_->Read(out); }
+
+// Forward to Impl::Read with the given field indices.
+Status ORCFileReader::Read(const std::vector<int>& include_indices,
+                           std::shared_ptr<Table>* out) {
+  return impl_->Read(include_indices, out);
+}
+
+// Forward to Impl::ReadStripe (no column selection).
+Status ORCFileReader::ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* 
out) {
+  return impl_->ReadStripe(stripe, out);
+}
+
+// Forward to Impl::ReadStripe with the given field indices.
+Status ORCFileReader::ReadStripe(int64_t stripe, const std::vector<int>& 
include_indices,
+                                 std::shared_ptr<RecordBatch>* out) {
+  return impl_->ReadStripe(stripe, include_indices, out);
+}
+
+// Forward to Impl::NumberOfStripes.
+int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); }
+
+// Forward to Impl::NumberOfRows.
+int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); }
+
+}  // namespace orc
+}  // namespace adapters
+}  // namespace arrow
diff --git a/cpp/src/arrow/adapters/orc/adapter.h 
b/cpp/src/arrow/adapters/orc/adapter.h
new file mode 100644
index 000000000..6438658fd
--- /dev/null
+++ b/cpp/src/arrow/adapters/orc/adapter.h
@@ -0,0 +1,105 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_ORC_CONVERTER_H
+#define ARROW_ORC_CONVERTER_H
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/memory_pool.h"
+#include "arrow/record_batch.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace adapters {
+
+namespace orc {
+
+/// \class ORCFileReader
+/// \brief Read an Arrow Table or RecordBatch from an ORC file.
+class ARROW_EXPORT ORCFileReader {
+ public:
+  ~ORCFileReader();
+
+  /// \brief Create a new ORC reader
+  ///
+  /// \param[in] file the data source
+  /// \param[in] pool a MemoryPool to use for buffer allocations
+  /// \param[out] reader the returned reader object
+  /// \return Status
+  static Status Open(const std::shared_ptr<io::ReadableFileInterface>& file,
+                     MemoryPool* pool, std::unique_ptr<ORCFileReader>* reader);
+
+  /// \brief Return the schema read from the ORC file
+  ///
+  /// \param[out] out the returned Schema object
+  /// \return Status
+  Status ReadSchema(std::shared_ptr<Schema>* out);
+
+  /// \brief Read the file as a Table
+  ///
+  /// The table will be composed of one record batch per stripe.
+  ///
+  /// \param[out] out the returned Table
+  /// \return Status
+  Status Read(std::shared_ptr<Table>* out);
+
+  /// \brief Read the file as a Table
+  ///
+  /// The table will be composed of one record batch per stripe.
+  ///
+  /// \param[in] include_indices the selected field indices to read
+  /// \param[out] out the returned Table
+  /// \return Status
+  Status Read(const std::vector<int>& include_indices, std::shared_ptr<Table>* 
out);
+
+  /// \brief Read a single stripe as a RecordBatch
+  ///
+  /// \param[in] stripe the stripe index
+  /// \param[out] out the returned RecordBatch
+  /// \return Status
+  Status ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out);
+
+  /// \brief Read a single stripe as a RecordBatch
+  ///
+  /// \param[in] stripe the stripe index
+  /// \param[in] include_indices the selected field indices to read
+  /// \param[out] out the returned RecordBatch
+  /// \return Status
+  Status ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
+                    std::shared_ptr<RecordBatch>* out);
+
+  /// \brief The number of stripes in the file
+  int64_t NumberOfStripes();
+
+  /// \brief The number of rows in the file
+  int64_t NumberOfRows();
+
+ private:
+  // Implementation is hidden behind a pointer; Impl is only declared here.
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+  // Private constructor: instances are created through Open().
+  ORCFileReader();
+};
+
+}  // namespace orc
+
+}  // namespace adapters
+
+}  // namespace arrow
+
+#endif  // ARROW_ORC_CONVERTER_H
diff --git a/cpp/src/arrow/symbols.map b/cpp/src/arrow/symbols.map
index f216d8650..c5d23793c 100644
--- a/cpp/src/arrow/symbols.map
+++ b/cpp/src/arrow/symbols.map
@@ -55,6 +55,8 @@
     ERR_getErrorString;
     # jemalloc
     je_arrow_*;
+    # ORC destructors
+    _ZThn8_N3orc*;
 
     extern "C++" {
       # devtoolset or -static-libstdc++ - the Red Hat devtoolset statically
@@ -65,6 +67,8 @@
 
       # Statically linked C++ dependencies
       boost::*;
+      google::*;
+      orc::*;
       snappy::*;
     };
 };
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index cbbb464d0..e9de08ba1 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -70,6 +70,9 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL 
"${CMAKE_CURRENT_SOURCE_DIR}")
   option(PYARROW_BUILD_PLASMA
     "Build the PyArrow Plasma integration"
     OFF)
+  option(PYARROW_BUILD_ORC
+    "Build the PyArrow ORC integration"
+    OFF)
   option(PYARROW_BUNDLE_ARROW_CPP
     "Bundle the Arrow C++ libraries"
     OFF)
@@ -357,6 +360,14 @@ if (PYARROW_BUILD_PLASMA)
   file(COPY ${PLASMA_EXECUTABLE} DESTINATION ${BUILD_OUTPUT_ROOT_DIRECTORY})
 endif()
 
+
+if (PYARROW_BUILD_ORC)
+  ## ORC
+  set(CYTHON_EXTENSIONS
+      ${CYTHON_EXTENSIONS}
+      _orc)
+endif()
+
 ############################################################
 # Setup and build Cython modules
 ############################################################
diff --git a/python/manylinux1/scripts/check_arrow_visibility.sh 
b/python/manylinux1/scripts/check_arrow_visibility.sh
index 27a30f747..bed357edf 100755
--- a/python/manylinux1/scripts/check_arrow_visibility.sh
+++ b/python/manylinux1/scripts/check_arrow_visibility.sh
@@ -17,10 +17,13 @@
 # under the License.
 
 nm -D -C /arrow-dist/lib64/libarrow.so > nm_arrow.log
+grep ' T ' nm_arrow.log | grep -v arrow > visible_symbols.log
 
-if [[ `grep ' T ' nm_arrow.log | grep -v arrow | wc -l` -eq 2 ]]
+if [[ `cat visible_symbols.log | wc -l` -eq 2 ]]
 then
     exit 0
 fi
 
+cat visible_symbols.log
+
 exit 1
diff --git a/python/pyarrow/_orc.pxd b/python/pyarrow/_orc.pxd
new file mode 100644
index 000000000..411691510
--- /dev/null
+++ b/python/pyarrow/_orc.pxd
@@ -0,0 +1,50 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# distutils: language = c++
+
+from libc.string cimport const_char
+from libcpp.vector cimport vector as std_vector
+from pyarrow.includes.common cimport *
+from pyarrow.includes.libarrow cimport (CArray, CSchema, CStatus,
+                                        CTable, CMemoryPool,
+                                        CKeyValueMetadata,
+                                        CRecordBatch,
+                                        CTable,
+                                        RandomAccessFile, OutputStream,
+                                        TimeUnit)
+
+
+# Cython declarations for the C++ class arrow::adapters::orc::ORCFileReader
+# from "arrow/adapters/orc/adapter.h". All members are declared nogil.
+cdef extern from "arrow/adapters/orc/adapter.h" namespace 
"arrow::adapters::orc" nogil:
+    cdef cppclass ORCFileReader:
+
+        # Factory: on success the new reader is stored in `reader`.
+        @staticmethod
+        CStatus Open(const shared_ptr[RandomAccessFile]& file,
+                     CMemoryPool* pool,
+                     unique_ptr[ORCFileReader]* reader)
+
+        CStatus ReadSchema(shared_ptr[CSchema]* out)
+
+        # Read one stripe, optionally restricted to the given field indices.
+        CStatus ReadStripe(int64_t stripe, shared_ptr[CRecordBatch]* out)
+        CStatus ReadStripe(int64_t stripe, std_vector[int], 
shared_ptr[CRecordBatch]* out)
+
+        # Read the whole file, optionally restricted to the given field indices.
+        CStatus Read(shared_ptr[CTable]* out)
+        CStatus Read(std_vector[int], shared_ptr[CTable]* out)
+
+        int64_t NumberOfStripes()
+
+        int64_t NumberOfRows()
diff --git a/python/pyarrow/_orc.pyx b/python/pyarrow/_orc.pyx
new file mode 100644
index 000000000..7ff4bac6d
--- /dev/null
+++ b/python/pyarrow/_orc.pyx
@@ -0,0 +1,111 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+
+from cython.operator cimport dereference as deref
+from libcpp.vector cimport vector as std_vector
+from pyarrow.includes.common cimport *
+from pyarrow.includes.libarrow cimport *
+from pyarrow.lib cimport (check_status,
+                          MemoryPool, maybe_unbox_memory_pool,
+                          Schema, pyarrow_wrap_schema,
+                          RecordBatch,
+                          pyarrow_wrap_table,
+                          get_reader)
+import six
+
+
+cdef class ORCReader:
+    """
+    Low-level wrapper around the C++ ``ORCFileReader``.
+
+    Parameters
+    ----------
+    memory_pool : MemoryPool, optional
+        Passed through ``maybe_unbox_memory_pool`` to obtain the allocator
+        handed to the C++ reader when a source is opened.
+    """
+    cdef:
+        object source
+        CMemoryPool* allocator
+        unique_ptr[ORCFileReader] reader
+
+    def __cinit__(self, MemoryPool memory_pool=None):
+        self.allocator = maybe_unbox_memory_pool(memory_pool)
+
+    def open(self, object source):
+        """
+        Open ``source`` for reading.
+
+        Parameters
+        ----------
+        source : object
+            Anything accepted by ``get_reader``. A reference is kept on the
+            instance for as long as the reader exists.
+        """
+        cdef:
+            shared_ptr[RandomAccessFile] rd_handle
+
+        self.source = source
+
+        get_reader(source, &rd_handle)
+        with nogil:
+            check_status(ORCFileReader.Open(rd_handle, self.allocator,
+                                             &self.reader))
+
+    def schema(self):
+        """
+        The arrow schema for this file.
+
+        Returns
+        -------
+        schema : pyarrow.Schema
+        """
+        cdef:
+            shared_ptr[CSchema] sp_arrow_schema
+
+        with nogil:
+            check_status(deref(self.reader).ReadSchema(&sp_arrow_schema))
+
+        return pyarrow_wrap_schema(sp_arrow_schema)
+
+    def nrows(self):
+        """
+        The number of rows in the file, as reported by the C++ reader.
+        """
+        return deref(self.reader).NumberOfRows()
+
+    def nstripes(self):
+        """
+        The number of stripes in the file, as reported by the C++ reader.
+        """
+        return deref(self.reader).NumberOfStripes()
+
+    def read_stripe(self, n, include_indices=None):
+        """
+        Read a single stripe as a RecordBatch.
+
+        Parameters
+        ----------
+        n : int
+            The stripe index.
+        include_indices : sequence of int, optional
+            Field indices to read. If None, the overload without a column
+            selection is called.
+
+        Returns
+        -------
+        batch : RecordBatch
+        """
+        cdef:
+            shared_ptr[CRecordBatch] sp_record_batch
+            RecordBatch batch
+            int64_t stripe
+            std_vector[int] indices
+
+        stripe = n
+
+        if include_indices is None:
+            with nogil:
+                check_status(deref(self.reader).ReadStripe(stripe,
+                                                           &sp_record_batch))
+        else:
+            # Convert to a C++ vector before the GIL is released.
+            indices = include_indices
+            with nogil:
+                check_status(deref(self.reader).ReadStripe(stripe, indices,
+                                                           &sp_record_batch))
+
+        batch = RecordBatch()
+        batch.init(sp_record_batch)
+        return batch
+
+    def read(self, include_indices=None):
+        """
+        Read the whole file as a Table.
+
+        Parameters
+        ----------
+        include_indices : sequence of int, optional
+            Field indices to read. If None, the overload without a column
+            selection is called.
+
+        Returns
+        -------
+        The result of ``pyarrow_wrap_table`` on the table that was read.
+        """
+        cdef:
+            shared_ptr[CTable] sp_table
+            std_vector[int] indices
+
+        if include_indices is None:
+            with nogil:
+                check_status(deref(self.reader).Read(&sp_table))
+        else:
+            # Convert to a C++ vector before the GIL is released.
+            indices = include_indices
+            with nogil:
+                check_status(deref(self.reader).Read(indices, &sp_table))
+
+        return pyarrow_wrap_table(sp_table)
diff --git a/python/pyarrow/orc.py b/python/pyarrow/orc.py
new file mode 100644
index 000000000..22451d521
--- /dev/null
+++ b/python/pyarrow/orc.py
@@ -0,0 +1,149 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from itertools import count
+from numbers import Integral
+
+from pyarrow import _orc
+from pyarrow import types
+from pyarrow.lib import Schema
+
+
+def _is_map(typ):
+    return (types.is_list(typ) and
+            types.is_struct(typ.value_type) and
+            typ.value_type.num_children == 2 and
+            typ.value_type[0].name == 'key' and
+            typ.value_type[1].name == 'value')
+
+
+def _traverse(typ, counter):
+    if isinstance(typ, Schema) or types.is_struct(typ):
+        for field in typ:
+            path = (field.name,)
+            yield path, next(counter)
+            for sub, c in _traverse(field.type, counter):
+                yield path + sub, c
+    elif _is_map(typ):
+        for sub_c in _traverse(typ.value_type, counter):
+            yield sub_c
+    elif types.is_list(typ):
+        # Skip one index for list type, since this can never be selected
+        # directly
+        next(counter)
+        for sub_c in _traverse(typ.value_type, counter):
+            yield sub_c
+    elif types.is_union(typ):
+        # Union types not supported, just skip the indexes
+        for dtype in typ:
+            next(counter)
+            for sub_c in _traverse(dtype, counter):
+                pass
+
+
+def _schema_to_indices(schema):
+    return {'.'.join(i): c for i, c in _traverse(schema, count(1))}
+
+
+class ORCFile(object):
+    """
+    Reader interface for a single ORC file
+
+    Parameters
+    ----------
+    source : str or pyarrow.io.NativeFile
+        Readable source. For passing Python file objects or byte buffers,
+        see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
+    """
+    def __init__(self, source):
+        self.reader = _orc.ORCReader()
+        self.reader.open(source)
+        self._column_index_lookup = _schema_to_indices(self.schema)
+
+    @property
+    def schema(self):
+        """The file schema, as an arrow schema"""
+        return self.reader.schema()
+
+    @property
+    def nrows(self):
+        """The number of rows in the file"""
+        return self.reader.nrows()
+
+    @property
+    def nstripes(self):
+        """The number of stripes in the file"""
+        return self.reader.nstripes()
+
+    def _select_indices(self, columns=None):
+        if columns is None:
+            return None
+
+        schema = self.schema
+        indices = []
+        for col in columns:
+            if isinstance(col, Integral):
+                col = int(col)
+                if 0 <= col < len(schema):
+                    col = schema[col].name
+                else:
+                    raise ValueError("Column indices must be in 0 <= ind < %d,"
+                                     " got %d" % (len(schema), col))
+            if col in self._column_index_lookup:
+                indices.append(self._column_index_lookup[col])
+            else:
+                raise ValueError("Unknown column name %r" % col)
+
+        return indices
+
+    def read_stripe(self, n, columns=None):
+        """Read a single stripe from the file.
+
+        Parameters
+        ----------
+        n : int
+            The stripe index
+        columns : list
+            If not None, only these columns will be read from the stripe. A
+            column name may be a prefix of a nested field, e.g. 'a' will select
+            'a.b', 'a.c', and 'a.d.e'
+
+        Returns
+        -------
+        pyarrow.lib.RecordBatch
+            Content of the stripe as a RecordBatch.
+        """
+        include_indices = self._select_indices(columns)
+        return self.reader.read_stripe(n, include_indices=include_indices)
+
+    def read(self, columns=None):
+        """Read the whole file.
+
+        Parameters
+        ----------
+        columns : list
+            If not None, only these columns will be read from the file. A
+            column name may be a prefix of a nested field, e.g. 'a' will select
+            'a.b', 'a.c', and 'a.d.e'
+
+        Returns
+        -------
+        pyarrow.lib.Table
+            Content of the file as a Table.
+        """
+        include_indices = self._select_indices(columns)
+        return self.reader.read(include_indices=include_indices)
diff --git a/python/setup.py b/python/setup.py
index 32e76ab05..3d3831dc0 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -84,6 +84,7 @@ def run(self):
                      ('with-parquet', None, 'build the Parquet extension'),
                      ('with-static-parquet', None, 'link parquet statically'),
                      ('with-plasma', None, 'build the Plasma extension'),
+                     ('with-orc', None, 'build the ORC extension'),
                      ('bundle-arrow-cpp', None,
                       'bundle the Arrow C++ libraries')] +
                     _build_ext.user_options)
@@ -109,12 +110,15 @@ def initialize_options(self):
             os.environ.get('PYARROW_WITH_STATIC_BOOST', '1'))
         self.with_plasma = strtobool(
             os.environ.get('PYARROW_WITH_PLASMA', '0'))
+        self.with_orc = strtobool(
+            os.environ.get('PYARROW_WITH_ORC', '0'))
         self.bundle_arrow_cpp = strtobool(
             os.environ.get('PYARROW_BUNDLE_ARROW_CPP', '0'))
 
     CYTHON_MODULE_NAMES = [
         'lib',
         '_parquet',
+        '_orc',
         'plasma']
 
     def _run_cmake(self):
@@ -157,6 +161,9 @@ def _run_cmake(self):
         if self.with_plasma:
             cmake_options.append('-DPYARROW_BUILD_PLASMA=on')
 
+        if self.with_orc:
+            cmake_options.append('-DPYARROW_BUILD_ORC=on')
+
         if len(self.cmake_cxxflags) > 0:
             cmake_options.append('-DPYARROW_CXXFLAGS="{0}"'
                                  .format(self.cmake_cxxflags))
@@ -284,6 +291,8 @@ def _failure_permitted(self, name):
             return True
         if name == 'plasma' and not self.with_plasma:
             return True
+        if name == '_orc' and not self.with_orc:
+            return True
         return False
 
     def _get_inplace_dir(self):


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> Add support for reading ORC files
> ---------------------------------
>
>                 Key: ARROW-1920
>                 URL: https://issues.apache.org/jira/browse/ARROW-1920
>             Project: Apache Arrow
>          Issue Type: New Feature
>          Components: C++, Python
>            Reporter: Jim Crist
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>
> Would be nice to be able to read ORC files in pyarrow, similar to the already 
> existing parquet support.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

[jira] [Commented] (ARROW-1920) Add support for reading ORC files

Reply via email to