This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 3bddb01 ARROW-7288: [C++][Parquet] Don't use regular expression to
parse application version
3bddb01 is described below
commit 3bddb0132d5e6e18b23fe39c5724c3cd90ca4b62
Author: Sutou Kouhei <[email protected]>
AuthorDate: Tue Feb 2 13:15:51 2021 +0100
ARROW-7288: [C++][Parquet] Don't use regular expression to parse
application version
std::regex provided by MinGW may take a long time with a Japanese locale
on Windows.
We could use std::regex, boost::regex or RE2 as the regular expression
engine for this, but RE2's syntax is not compatible with the others. If
we supported all of them, we would need to maintain multiple regular
expressions, which increases maintenance cost. If we don't use regular
expressions at all, we avoid that problem, at the cost of maintaining a
hand-written parser.
Closes #9367 from kou/cpp-parquet-no-regex
Lead-authored-by: Sutou Kouhei <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
ci/docker/python-wheel-manylinux-201x.dockerfile | 1 -
ci/docker/python-wheel-windows-vs2017.dockerfile | 1 -
ci/docker/ubuntu-18.04-cpp.dockerfile | 1 -
ci/docker/ubuntu-20.04-cpp.dockerfile | 1 -
cpp/build-support/trim-boost.sh | 2 -
cpp/cmake_modules/FindBoostAlt.cmake | 4 +-
cpp/cmake_modules/ThirdpartyToolchain.cmake | 17 +-
cpp/src/parquet/CMakeLists.txt | 12 +-
cpp/src/parquet/metadata.cc | 378 +++++++++++++++++----
cpp/src/parquet/metadata.h | 15 +-
cpp/src/parquet/metadata_test.cc | 209 ++++++++++++
.../apache-arrow/apt/debian-buster/Dockerfile | 1 -
.../apache-arrow/apt/ubuntu-bionic/Dockerfile | 1 -
.../apache-arrow/apt/ubuntu-focal/Dockerfile | 1 -
.../apache-arrow/apt/ubuntu-groovy/Dockerfile | 1 -
.../apache-arrow/apt/ubuntu-xenial/Dockerfile | 1 -
.../apache-arrow/debian.ubuntu-xenial/control | 1 -
.../linux-packages/apache-arrow/debian/control.in | 1 -
.../linux-packages/apache-arrow/yum/arrow.spec.in | 1 -
19 files changed, 520 insertions(+), 129 deletions(-)
diff --git a/ci/docker/python-wheel-manylinux-201x.dockerfile
b/ci/docker/python-wheel-manylinux-201x.dockerfile
index 0a17d86..4be0c97 100644
--- a/ci/docker/python-wheel-manylinux-201x.dockerfile
+++ b/ci/docker/python-wheel-manylinux-201x.dockerfile
@@ -70,7 +70,6 @@ RUN vcpkg install --clean-after-build \
abseil \
aws-sdk-cpp[config,cognito-identity,core,identity-management,s3,sts,transfer] \
boost-filesystem \
- boost-regex \
brotli \
bzip2 \
c-ares \
diff --git a/ci/docker/python-wheel-windows-vs2017.dockerfile
b/ci/docker/python-wheel-windows-vs2017.dockerfile
index 50372bd..ecd58b4 100644
--- a/ci/docker/python-wheel-windows-vs2017.dockerfile
+++ b/ci/docker/python-wheel-windows-vs2017.dockerfile
@@ -52,7 +52,6 @@ RUN vcpkg install --clean-after-build \
aws-sdk-cpp[config,cognito-identity,core,identity-management,s3,sts,transfer] \
boost-filesystem \
boost-multiprecision \
- boost-regex \
boost-system \
brotli \
bzip2 \
diff --git a/ci/docker/ubuntu-18.04-cpp.dockerfile
b/ci/docker/ubuntu-18.04-cpp.dockerfile
index dfe8666..4b855b5 100644
--- a/ci/docker/ubuntu-18.04-cpp.dockerfile
+++ b/ci/docker/ubuntu-18.04-cpp.dockerfile
@@ -66,7 +66,6 @@ RUN apt-get update -y -q && \
git \
libbenchmark-dev \
libboost-filesystem-dev \
- libboost-regex-dev \
libboost-system-dev \
libbrotli-dev \
libbz2-dev \
diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile
b/ci/docker/ubuntu-20.04-cpp.dockerfile
index fbcda44..3a37ace 100644
--- a/ci/docker/ubuntu-20.04-cpp.dockerfile
+++ b/ci/docker/ubuntu-20.04-cpp.dockerfile
@@ -68,7 +68,6 @@ RUN apt-get update -y -q && \
git \
libbenchmark-dev \
libboost-filesystem-dev \
- libboost-regex-dev \
libboost-system-dev \
libbrotli-dev \
libbz2-dev \
diff --git a/cpp/build-support/trim-boost.sh b/cpp/build-support/trim-boost.sh
index 7ffadc4..ebc4ccd 100755
--- a/cpp/build-support/trim-boost.sh
+++ b/cpp/build-support/trim-boost.sh
@@ -35,8 +35,6 @@ set -eu
BOOST_LIBS="system.hpp filesystem.hpp"
# Add these to be able to build those
BOOST_LIBS="$BOOST_LIBS config build boost_install headers log predef"
-# Parquet needs this (if using gcc < 4.9)
-BOOST_LIBS="$BOOST_LIBS regex.hpp"
# Gandiva needs these
BOOST_LIBS="$BOOST_LIBS multiprecision/cpp_int.hpp"
# These are for Thrift when Thrift_SOURCE=BUNDLED
diff --git a/cpp/cmake_modules/FindBoostAlt.cmake
b/cpp/cmake_modules/FindBoostAlt.cmake
index 300080d..123c6dd 100644
--- a/cpp/cmake_modules/FindBoostAlt.cmake
+++ b/cpp/cmake_modules/FindBoostAlt.cmake
@@ -39,7 +39,7 @@ if(ARROW_BOOST_USE_SHARED)
set(BUILD_SHARED_LIBS ON)
find_package(Boost ${BoostAlt_FIND_VERSION_OPTIONS}
- COMPONENTS regex system filesystem)
+ COMPONENTS system filesystem)
set(BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS_KEEP})
unset(BUILD_SHARED_LIBS_KEEP)
else()
@@ -47,7 +47,7 @@ else()
# TODO Differentiate here between release and debug builds
set(Boost_USE_STATIC_LIBS ON)
find_package(Boost ${BoostAlt_FIND_VERSION_OPTIONS}
- COMPONENTS regex system filesystem)
+ COMPONENTS system filesystem)
endif()
if(Boost_FOUND)
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake
b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index 961edac..71d91a4 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -760,10 +760,8 @@ macro(build_boost)
)
set(BOOST_SYSTEM_LIBRARY boost_system_static)
set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static)
- set(BOOST_REGEX_LIBRARY boost_regex_static)
set(BOOST_BUILD_PRODUCTS ${BOOST_STATIC_SYSTEM_LIBRARY}
- ${BOOST_STATIC_FILESYSTEM_LIBRARY}
- ${BOOST_STATIC_REGEX_LIBRARY})
+ ${BOOST_STATIC_FILESYSTEM_LIBRARY})
add_thirdparty_lib(boost_system STATIC_LIB
"${BOOST_STATIC_SYSTEM_LIBRARY}")
@@ -854,13 +852,6 @@ else()
set(THRIFT_REQUIRES_BOOST FALSE)
endif()
-# Parquet requires boost only with gcc 4.8 (because of missing std::regex).
-if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION
VERSION_LESS "4.9")
- set(PARQUET_REQUIRES_BOOST TRUE)
-else()
- set(PARQUET_REQUIRES_BOOST FALSE)
-endif()
-
# Compilers that don't support int128_t have a compile-time
# (header-only) dependency on Boost for int128_t.
if(ARROW_USE_UBSAN)
@@ -878,8 +869,7 @@ endif()
if(ARROW_BUILD_INTEGRATION
OR ARROW_BUILD_TESTS
OR (ARROW_FLIGHT AND ARROW_BUILD_BENCHMARKS)
- OR (ARROW_S3 AND ARROW_BUILD_BENCHMARKS)
- OR (ARROW_PARQUET AND PARQUET_REQUIRES_BOOST))
+ OR (ARROW_S3 AND ARROW_BUILD_BENCHMARKS))
set(ARROW_BOOST_REQUIRED TRUE)
set(ARROW_BOOST_REQUIRE_LIBRARY TRUE)
elseif(ARROW_GANDIVA
@@ -904,15 +894,12 @@ if(ARROW_BOOST_REQUIRED)
if(TARGET Boost::system)
set(BOOST_SYSTEM_LIBRARY Boost::system)
set(BOOST_FILESYSTEM_LIBRARY Boost::filesystem)
- set(BOOST_REGEX_LIBRARY Boost::regex)
elseif(BoostAlt_FOUND)
set(BOOST_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY})
set(BOOST_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY})
- set(BOOST_REGEX_LIBRARY ${Boost_REGEX_LIBRARY})
else()
set(BOOST_SYSTEM_LIBRARY boost_system_static)
set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static)
- set(BOOST_REGEX_LIBRARY boost_regex_static)
endif()
set(ARROW_BOOST_LIBS ${BOOST_SYSTEM_LIBRARY} ${BOOST_FILESYSTEM_LIBRARY})
diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt
index 03443b2..2c7988b 100644
--- a/cpp/src/parquet/CMakeLists.txt
+++ b/cpp/src/parquet/CMakeLists.txt
@@ -129,13 +129,6 @@ else()
set(ARROW_LIBRARIES_FOR_STATIC_TESTS arrow_testing_shared arrow_shared)
endif()
-set(PARQUET_BOOST_LINK_LIBS)
-
-if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION
VERSION_LESS "4.9")
- add_definitions(-DPARQUET_USE_BOOST_REGEX)
- list(APPEND PARQUET_BOOST_LINK_LIBS ${BOOST_REGEX_LIBRARY})
-endif()
-
set(PARQUET_MIN_TEST_LIBS GTest::gtest_main GTest::gtest)
if(APPLE)
@@ -236,12 +229,11 @@ if(NOT PARQUET_MINIMAL_DEPENDENCY)
# These are libraries that we will link privately with parquet_shared (as
they
# do not need to be linked transitively by other linkers)
- set(PARQUET_SHARED_PRIVATE_LINK_LIBS ${PARQUET_BOOST_LINK_LIBS}
thrift::thrift)
+ set(PARQUET_SHARED_PRIVATE_LINK_LIBS thrift::thrift)
# Link publicly with parquet_static (because internal users need to
# transitively link all dependencies)
- set(PARQUET_STATIC_LINK_LIBS ${PARQUET_STATIC_LINK_LIBS}
${PARQUET_BOOST_LINK_LIBS}
- thrift::thrift)
+ set(PARQUET_STATIC_LINK_LIBS ${PARQUET_STATIC_LINK_LIBS} thrift::thrift)
# Although we don't link parquet_objlib against anything, we need it to
depend
# on these libs as we may generate their headers via ExternalProject_Add
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index 34803f5..7d239f2 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -25,6 +25,7 @@
#include <vector>
#include "arrow/util/logging.h"
+#include "arrow/util/string_view.h"
#include "parquet/encryption_internal.h"
#include "parquet/exception.h"
#include "parquet/internal_file_decryptor.h"
@@ -33,45 +34,6 @@
#include "parquet/statistics.h"
#include "parquet/thrift_internal.h"
-// ARROW-6096: The boost regex library must be used when compiling with gcc <
4.9
-#if defined(PARQUET_USE_BOOST_REGEX)
-#include <boost/regex.hpp> // IWYU pragma: keep
-using ::boost::regex;
-using ::boost::smatch;
-
-template <typename... Args>
-static bool regex_match(Args&&... args) {
- try {
- return boost::regex_match(std::forward<Args>(args)...);
- } catch (const boost::regex_error& e) {
- if (e.code() == boost::regex_constants::error_complexity ||
- e.code() == boost::regex_constants::error_stack) {
- // Input-dependent error => return as if matching failed
- return false;
- }
- throw;
- }
-}
-#else
-#include <regex>
-using ::std::regex;
-using ::std::smatch;
-
-template <typename... Args>
-static bool regex_match(Args&&... args) {
- try {
- return std::regex_match(std::forward<Args>(args)...);
- } catch (const std::regex_error& e) {
- if (e.code() == std::regex_constants::error_complexity ||
- e.code() == std::regex_constants::error_stack) {
- // Input-dependent error => return as if matching failed
- return false;
- }
- throw;
- }
-}
-#endif
-
namespace parquet {
const ApplicationVersion& ApplicationVersion::PARQUET_251_FIXED_VERSION() {
@@ -949,43 +911,309 @@ ApplicationVersion::ApplicationVersion(std::string
application, int major, int m
int patch)
: application_(std::move(application)), version{major, minor, patch, "",
"", ""} {}
-ApplicationVersion::ApplicationVersion(const std::string& created_by) {
- // Use singletons to compile only once (ARROW-9863)
- static regex app_regex{ApplicationVersion::APPLICATION_FORMAT};
- static regex ver_regex{ApplicationVersion::VERSION_FORMAT};
- smatch app_matches;
- smatch ver_matches;
-
- std::string created_by_lower = created_by;
- std::transform(created_by_lower.begin(), created_by_lower.end(),
- created_by_lower.begin(), ::tolower);
-
- bool app_success = regex_match(created_by_lower, app_matches, app_regex);
- bool ver_success = false;
- std::string version_str;
-
- if (app_success && app_matches.size() >= 4) {
- // first match is the entire string. sub-matches start from second.
- application_ = app_matches[1];
- version_str = app_matches[3];
- build_ = app_matches[4];
- ver_success = regex_match(version_str, ver_matches, ver_regex);
- } else {
- application_ = "unknown";
- }
-
- if (ver_success && ver_matches.size() >= 7) {
- version.major = atoi(ver_matches[1].str().c_str());
- version.minor = atoi(ver_matches[2].str().c_str());
- version.patch = atoi(ver_matches[3].str().c_str());
- version.unknown = ver_matches[4].str();
- version.pre_release = ver_matches[5].str();
- version.build_info = ver_matches[6].str();
- } else {
- version.major = 0;
- version.minor = 0;
- version.patch = 0;
+namespace {
+// Parse the application version format and set parsed values to
+// ApplicationVersion.
+//
+// The application version format must be compatible with
+// parquet-mr's. See also:
+// *
https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/VersionParser.java
+// *
https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/SemanticVersion.java
+//
+// The application version format:
+// "${APPLICATION_NAME}"
+// "${APPLICATION_NAME} version ${VERSION}"
+// "${APPLICATION_NAME} version ${VERSION} (build ${BUILD_NAME})"
+//
+// Eg:
+// parquet-cpp
+// parquet-cpp version 1.5.0ab-xyz5.5.0+cd
+// parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
+//
+// The VERSION format:
+// "${MAJOR}"
+// "${MAJOR}.${MINOR}"
+// "${MAJOR}.${MINOR}.${PATCH}"
+// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}"
+// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}"
+// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}+${BUILD_INFO}"
+// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}+${BUILD_INFO}"
+// "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}"
+// "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}+${BUILD_INFO}"
+// "${MAJOR}.${MINOR}.${PATCH}+${BUILD_INFO}"
+//
+// Eg:
+// 1
+// 1.5
+// 1.5.0
+// 1.5.0ab
+// 1.5.0ab-cdh5.5.0
+// 1.5.0ab-cdh5.5.0+cd
+// 1.5.0ab+cd
+// 1.5.0-cdh5.5.0
+// 1.5.0-cdh5.5.0+cd
+// 1.5.0+cd
+class ApplicationVersionParser {
+ public:
+ ApplicationVersionParser(const std::string& created_by,
+ ApplicationVersion& application_version)
+ : created_by_(created_by),
+ application_version_(application_version),
+ spaces_(" \t\v\r\n\f"),
+ digits_("0123456789") {}
+
+ void Parse() {
+ application_version_.application_ = "unknown";
+ application_version_.version = {0, 0, 0, "", "", ""};
+
+ if (!ParseApplicationName()) {
+ return;
+ }
+ if (!ParseVersion()) {
+ return;
+ }
+ if (!ParseBuildName()) {
+ return;
+ }
+ }
+
+ private:
+ bool IsSpace(const std::string& string, const size_t& offset) {
+ auto target = ::arrow::util::string_view(string).substr(offset, 1);
+ return target.find_first_of(spaces_) != ::arrow::util::string_view::npos;
+ }
+
+ void RemovePrecedingSpaces(const std::string& string, size_t& start,
+ const size_t& end) {
+ while (start < end && IsSpace(string, start)) {
+ ++start;
+ }
+ }
+
+ void RemoveTrailingSpaces(const std::string& string, const size_t& start,
size_t& end) {
+ while (start < (end - 1) && (end - 1) < string.size() && IsSpace(string,
end - 1)) {
+ --end;
+ }
+ }
+
+ bool ParseApplicationName() {
+ std::string version_mark(" version ");
+ auto version_mark_position = created_by_.find(version_mark);
+ size_t application_name_end;
+ // No VERSION and BUILD_NAME.
+ if (version_mark_position == std::string::npos) {
+ version_start_ = std::string::npos;
+ application_name_end = created_by_.size();
+ } else {
+ version_start_ = version_mark_position + version_mark.size();
+ application_name_end = version_mark_position;
+ }
+
+ size_t application_name_start = 0;
+ RemovePrecedingSpaces(created_by_, application_name_start,
application_name_end);
+ RemoveTrailingSpaces(created_by_, application_name_start,
application_name_end);
+ application_version_.application_ = created_by_.substr(
+ application_name_start, application_name_end - application_name_start);
+
+ return true;
}
+
+ bool ParseVersion() {
+ // No VERSION.
+ if (version_start_ == std::string::npos) {
+ return false;
+ }
+
+ RemovePrecedingSpaces(created_by_, version_start_, created_by_.size());
+ version_end_ = created_by_.find(" (", version_start_);
+ // No BUILD_NAME.
+ if (version_end_ == std::string::npos) {
+ version_end_ = created_by_.size();
+ }
+ RemoveTrailingSpaces(created_by_, version_start_, version_end_);
+ // No VERSION.
+ if (version_start_ == version_end_) {
+ return false;
+ }
+ version_string_ = created_by_.substr(version_start_, version_end_ -
version_start_);
+
+ if (!ParseVersionMajor()) {
+ return false;
+ }
+ if (!ParseVersionMinor()) {
+ return false;
+ }
+ if (!ParseVersionPatch()) {
+ return false;
+ }
+ if (!ParseVersionUnknown()) {
+ return false;
+ }
+ if (!ParseVersionPreRelease()) {
+ return false;
+ }
+ if (!ParseVersionBuildInfo()) {
+ return false;
+ }
+
+ return true;
+ }
+
+ bool ParseVersionMajor() {
+ size_t version_major_start = 0;
+ auto version_major_end = version_string_.find_first_not_of(digits_);
+ // MAJOR only.
+ if (version_major_end == std::string::npos) {
+ version_major_end = version_string_.size();
+ version_parsing_position_ = version_major_end;
+ } else {
+ // No ".".
+ if (version_string_[version_major_end] != '.') {
+ return false;
+ }
+ // No MAJOR.
+ if (version_major_end == version_major_start) {
+ return false;
+ }
+ version_parsing_position_ = version_major_end + 1; // +1 is for '.'.
+ }
+ auto version_major_string = version_string_.substr(
+ version_major_start, version_major_end - version_major_start);
+ application_version_.version.major = atoi(version_major_string.c_str());
+ return true;
+ }
+
+ bool ParseVersionMinor() {
+ auto version_minor_start = version_parsing_position_;
+ auto version_minor_end =
+ version_string_.find_first_not_of(digits_, version_minor_start);
+ // MAJOR.MINOR only.
+ if (version_minor_end == std::string::npos) {
+ version_minor_end = version_string_.size();
+ version_parsing_position_ = version_minor_end;
+ } else {
+ // No ".".
+ if (version_string_[version_minor_end] != '.') {
+ return false;
+ }
+ // No MINOR.
+ if (version_minor_end == version_minor_start) {
+ return false;
+ }
+ version_parsing_position_ = version_minor_end + 1; // +1 is for '.'.
+ }
+ auto version_minor_string = version_string_.substr(
+ version_minor_start, version_minor_end - version_minor_start);
+ application_version_.version.minor = atoi(version_minor_string.c_str());
+ return true;
+ }
+
+ bool ParseVersionPatch() {
+ auto version_patch_start = version_parsing_position_;
+ auto version_patch_end =
+ version_string_.find_first_not_of(digits_, version_patch_start);
+ // No UNKNOWN, PRE_RELEASE and BUILD_INFO.
+ if (version_patch_end == std::string::npos) {
+ version_patch_end = version_string_.size();
+ }
+ // No PATCH.
+ if (version_patch_end == version_patch_start) {
+ return false;
+ }
+ auto version_patch_string = version_string_.substr(
+ version_patch_start, version_patch_end - version_patch_start);
+ application_version_.version.patch = atoi(version_patch_string.c_str());
+ version_parsing_position_ = version_patch_end;
+ return true;
+ }
+
+ bool ParseVersionUnknown() {
+ // No UNKNOWN.
+ if (version_parsing_position_ == version_string_.size()) {
+ return true;
+ }
+ auto version_unknown_start = version_parsing_position_;
+ auto version_unknown_end = version_string_.find_first_of("-+",
version_unknown_start);
+ // No PRE_RELEASE and BUILD_INFO
+ if (version_unknown_end == std::string::npos) {
+ version_unknown_end = version_string_.size();
+ }
+ application_version_.version.unknown = version_string_.substr(
+ version_unknown_start, version_unknown_end - version_unknown_start);
+ version_parsing_position_ = version_unknown_end;
+ return true;
+ }
+
+ bool ParseVersionPreRelease() {
+ // No PRE_RELEASE.
+ if (version_parsing_position_ == version_string_.size() ||
+ version_string_[version_parsing_position_] != '-') {
+ return true;
+ }
+
+ auto version_pre_release_start = version_parsing_position_ + 1; // +1 is
for '-'.
+ auto version_pre_release_end =
+ version_string_.find_first_of("+", version_pre_release_start);
+ // No BUILD_INFO
+ if (version_pre_release_end == std::string::npos) {
+ version_pre_release_end = version_string_.size();
+ }
+ application_version_.version.pre_release = version_string_.substr(
+ version_pre_release_start, version_pre_release_end -
version_pre_release_start);
+ version_parsing_position_ = version_pre_release_end;
+ return true;
+ }
+
+ bool ParseVersionBuildInfo() {
+ // No BUILD_INFO.
+ if (version_parsing_position_ == version_string_.size() ||
+ version_string_[version_parsing_position_] != '+') {
+ return true;
+ }
+
+ auto version_build_info_start = version_parsing_position_ + 1; // +1 is
for '+'.
+ application_version_.version.build_info =
+ version_string_.substr(version_build_info_start);
+ return true;
+ }
+
+ bool ParseBuildName() {
+ std::string build_mark(" (build ");
+ auto build_mark_position = created_by_.find(build_mark, version_end_);
+ // No BUILD_NAME.
+ if (build_mark_position == std::string::npos) {
+ return false;
+ }
+ auto build_name_start = build_mark_position + build_mark.size();
+ RemovePrecedingSpaces(created_by_, build_name_start, created_by_.size());
+ auto build_name_end = created_by_.find_first_of(")", build_name_start);
+ // No end ")".
+ if (build_name_end == std::string::npos) {
+ return false;
+ }
+ RemoveTrailingSpaces(created_by_, build_name_start, build_name_end);
+ application_version_.build_ =
+ created_by_.substr(build_name_start, build_name_end -
build_name_start);
+
+ return true;
+ }
+
+ const std::string& created_by_;
+ ApplicationVersion& application_version_;
+
+ // For parsing.
+ std::string spaces_;
+ std::string digits_;
+ size_t version_parsing_position_;
+ size_t version_start_;
+ size_t version_end_;
+ std::string version_string_;
+};
+} // namespace
+
+ApplicationVersion::ApplicationVersion(const std::string& created_by) {
+ ApplicationVersionParser parser(created_by, *this);
+ parser.Parse();
}
bool ApplicationVersion::VersionLt(const ApplicationVersion& other_version)
const {
diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h
index b487cf4..7152458 100644
--- a/cpp/src/parquet/metadata.h
+++ b/cpp/src/parquet/metadata.h
@@ -58,16 +58,6 @@ class PARQUET_EXPORT ApplicationVersion {
static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();
- // Regular expression for the version format
- // major . minor . patch unknown - prerelease.x + build info
- // Eg: 1.5.0ab-cdh5.5.0+cd
- static constexpr char const* VERSION_FORMAT =
- "^(\\d+)\\.(\\d+)\\.(\\d+)([^-+]*)?(?:-([^+]*))?(?:\\+(.*))?$";
- // Regular expression for the application format
- // application_name version VERSION_FORMAT (build build_name)
- // Eg: parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
- static constexpr char const* APPLICATION_FORMAT =
-
"(.*?)\\s*(?:(version\\s*(?:([^(]*?)\\s*(?:\\(\\s*build\\s*([^)]*?)\\s*\\))?)?)?)";
// Application that wrote the file. e.g. "IMPALA"
std::string application_;
@@ -77,9 +67,8 @@ class PARQUET_EXPORT ApplicationVersion {
// Version of the application that wrote the file, expressed as
// (<major>.<minor>.<patch>). Unmatched parts default to 0.
// "1.2.3" => {1, 2, 3}
- // "1.2" => {0, 0, 0}
- // "1.2-cdh5" => {0, 0, 0}
- // TODO (majetideepak): Implement support for pre_release
+ // "1.2" => {1, 2, 0}
+ // "1.2-cdh5" => {1, 2, 0}
struct {
int major;
int minor;
diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc
index 027ed03..dbd021a 100644
--- a/cpp/src/parquet/metadata_test.cc
+++ b/cpp/src/parquet/metadata_test.cc
@@ -342,5 +342,214 @@ TEST(ApplicationVersion, Basics) {
version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats_int,
SortOrder::UNSIGNED));
}
+TEST(ApplicationVersion, Empty) {
+ ApplicationVersion version("");
+
+ ASSERT_EQ("", version.application_);
+ ASSERT_EQ("", version.build_);
+ ASSERT_EQ(0, version.version.major);
+ ASSERT_EQ(0, version.version.minor);
+ ASSERT_EQ(0, version.version.patch);
+ ASSERT_EQ("", version.version.unknown);
+ ASSERT_EQ("", version.version.pre_release);
+ ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, NoVersion) {
+ ApplicationVersion version("parquet-mr (build abcd)");
+
+ ASSERT_EQ("parquet-mr (build abcd)", version.application_);
+ ASSERT_EQ("", version.build_);
+ ASSERT_EQ(0, version.version.major);
+ ASSERT_EQ(0, version.version.minor);
+ ASSERT_EQ(0, version.version.patch);
+ ASSERT_EQ("", version.version.unknown);
+ ASSERT_EQ("", version.version.pre_release);
+ ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionEmpty) {
+ ApplicationVersion version("parquet-mr version ");
+
+ ASSERT_EQ("parquet-mr", version.application_);
+ ASSERT_EQ("", version.build_);
+ ASSERT_EQ(0, version.version.major);
+ ASSERT_EQ(0, version.version.minor);
+ ASSERT_EQ(0, version.version.patch);
+ ASSERT_EQ("", version.version.unknown);
+ ASSERT_EQ("", version.version.pre_release);
+ ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoMajor) {
+ ApplicationVersion version("parquet-mr version .");
+
+ ASSERT_EQ("parquet-mr", version.application_);
+ ASSERT_EQ("", version.build_);
+ ASSERT_EQ(0, version.version.major);
+ ASSERT_EQ(0, version.version.minor);
+ ASSERT_EQ(0, version.version.patch);
+ ASSERT_EQ("", version.version.unknown);
+ ASSERT_EQ("", version.version.pre_release);
+ ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionInvalidMajor) {
+ ApplicationVersion version("parquet-mr version x1");
+
+ ASSERT_EQ("parquet-mr", version.application_);
+ ASSERT_EQ("", version.build_);
+ ASSERT_EQ(0, version.version.major);
+ ASSERT_EQ(0, version.version.minor);
+ ASSERT_EQ(0, version.version.patch);
+ ASSERT_EQ("", version.version.unknown);
+ ASSERT_EQ("", version.version.pre_release);
+ ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionMajorOnly) {
+ ApplicationVersion version("parquet-mr version 1");
+
+ ASSERT_EQ("parquet-mr", version.application_);
+ ASSERT_EQ("", version.build_);
+ ASSERT_EQ(1, version.version.major);
+ ASSERT_EQ(0, version.version.minor);
+ ASSERT_EQ(0, version.version.patch);
+ ASSERT_EQ("", version.version.unknown);
+ ASSERT_EQ("", version.version.pre_release);
+ ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoMinor) {
+ ApplicationVersion version("parquet-mr version 1.");
+
+ ASSERT_EQ("parquet-mr", version.application_);
+ ASSERT_EQ("", version.build_);
+ ASSERT_EQ(1, version.version.major);
+ ASSERT_EQ(0, version.version.minor);
+ ASSERT_EQ(0, version.version.patch);
+ ASSERT_EQ("", version.version.unknown);
+ ASSERT_EQ("", version.version.pre_release);
+ ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionMajorMinorOnly) {
+ ApplicationVersion version("parquet-mr version 1.7");
+
+ ASSERT_EQ("parquet-mr", version.application_);
+ ASSERT_EQ("", version.build_);
+ ASSERT_EQ(1, version.version.major);
+ ASSERT_EQ(7, version.version.minor);
+ ASSERT_EQ(0, version.version.patch);
+ ASSERT_EQ("", version.version.unknown);
+ ASSERT_EQ("", version.version.pre_release);
+ ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionInvalidMinor) {
+ ApplicationVersion version("parquet-mr version 1.x7");
+
+ ASSERT_EQ("parquet-mr", version.application_);
+ ASSERT_EQ("", version.build_);
+ ASSERT_EQ(1, version.version.major);
+ ASSERT_EQ(0, version.version.minor);
+ ASSERT_EQ(0, version.version.patch);
+ ASSERT_EQ("", version.version.unknown);
+ ASSERT_EQ("", version.version.pre_release);
+ ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoPatch) {
+ ApplicationVersion version("parquet-mr version 1.7.");
+
+ ASSERT_EQ("parquet-mr", version.application_);
+ ASSERT_EQ("", version.build_);
+ ASSERT_EQ(1, version.version.major);
+ ASSERT_EQ(7, version.version.minor);
+ ASSERT_EQ(0, version.version.patch);
+ ASSERT_EQ("", version.version.unknown);
+ ASSERT_EQ("", version.version.pre_release);
+ ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionInvalidPatch) {
+ ApplicationVersion version("parquet-mr version 1.7.x9");
+
+ ASSERT_EQ("parquet-mr", version.application_);
+ ASSERT_EQ("", version.build_);
+ ASSERT_EQ(1, version.version.major);
+ ASSERT_EQ(7, version.version.minor);
+ ASSERT_EQ(0, version.version.patch);
+ ASSERT_EQ("", version.version.unknown);
+ ASSERT_EQ("", version.version.pre_release);
+ ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoUnknown) {
+ ApplicationVersion version("parquet-mr version 1.7.9-cdh5.5.0+cd");
+
+ ASSERT_EQ("parquet-mr", version.application_);
+ ASSERT_EQ("", version.build_);
+ ASSERT_EQ(1, version.version.major);
+ ASSERT_EQ(7, version.version.minor);
+ ASSERT_EQ(9, version.version.patch);
+ ASSERT_EQ("", version.version.unknown);
+ ASSERT_EQ("cdh5.5.0", version.version.pre_release);
+ ASSERT_EQ("cd", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoPreRelease) {
+ ApplicationVersion version("parquet-mr version 1.7.9ab+cd");
+
+ ASSERT_EQ("parquet-mr", version.application_);
+ ASSERT_EQ("", version.build_);
+ ASSERT_EQ(1, version.version.major);
+ ASSERT_EQ(7, version.version.minor);
+ ASSERT_EQ(9, version.version.patch);
+ ASSERT_EQ("ab", version.version.unknown);
+ ASSERT_EQ("", version.version.pre_release);
+ ASSERT_EQ("cd", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoUnknownNoPreRelease) {
+ ApplicationVersion version("parquet-mr version 1.7.9+cd");
+
+ ASSERT_EQ("parquet-mr", version.application_);
+ ASSERT_EQ("", version.build_);
+ ASSERT_EQ(1, version.version.major);
+ ASSERT_EQ(7, version.version.minor);
+ ASSERT_EQ(9, version.version.patch);
+ ASSERT_EQ("", version.version.unknown);
+ ASSERT_EQ("", version.version.pre_release);
+ ASSERT_EQ("cd", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoUnknownBuildInfoPreRelease) {
+ ApplicationVersion version("parquet-mr version 1.7.9+cd-cdh5.5.0");
+
+ ASSERT_EQ("parquet-mr", version.application_);
+ ASSERT_EQ("", version.build_);
+ ASSERT_EQ(1, version.version.major);
+ ASSERT_EQ(7, version.version.minor);
+ ASSERT_EQ(9, version.version.patch);
+ ASSERT_EQ("", version.version.unknown);
+ ASSERT_EQ("", version.version.pre_release);
+ ASSERT_EQ("cd-cdh5.5.0", version.version.build_info);
+}
+
+TEST(ApplicationVersion, FullWithSpaces) {
+ ApplicationVersion version(
+ " parquet-mr \t version \v 1.5.3ab-cdh5.5.0+cd \r (build \n abcd \f) ");
+
+ ASSERT_EQ("parquet-mr", version.application_);
+ ASSERT_EQ("abcd", version.build_);
+ ASSERT_EQ(1, version.version.major);
+ ASSERT_EQ(5, version.version.minor);
+ ASSERT_EQ(3, version.version.patch);
+ ASSERT_EQ("ab", version.version.unknown);
+ ASSERT_EQ("cdh5.5.0", version.version.pre_release);
+ ASSERT_EQ("cd", version.version.build_info);
+}
+
} // namespace metadata
} // namespace parquet
diff --git a/dev/tasks/linux-packages/apache-arrow/apt/debian-buster/Dockerfile
b/dev/tasks/linux-packages/apache-arrow/apt/debian-buster/Dockerfile
index c6268f6..20ddef1 100644
--- a/dev/tasks/linux-packages/apache-arrow/apt/debian-buster/Dockerfile
+++ b/dev/tasks/linux-packages/apache-arrow/apt/debian-buster/Dockerfile
@@ -46,7 +46,6 @@ RUN \
git \
gtk-doc-tools \
libboost-filesystem-dev \
- libboost-regex-dev \
libboost-system-dev \
libbrotli-dev \
libbz2-dev \
diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-bionic/Dockerfile
b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-bionic/Dockerfile
index 79292cb..df51885 100644
--- a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-bionic/Dockerfile
+++ b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-bionic/Dockerfile
@@ -41,7 +41,6 @@ RUN \
git \
gtk-doc-tools \
libboost-filesystem-dev \
- libboost-regex-dev \
libboost-system-dev \
libbrotli-dev \
libbz2-dev \
diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal/Dockerfile
b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal/Dockerfile
index 898e64a..c1404da 100644
--- a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal/Dockerfile
+++ b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal/Dockerfile
@@ -41,7 +41,6 @@ RUN \
git \
gtk-doc-tools \
libboost-filesystem-dev \
- libboost-regex-dev \
libboost-system-dev \
libbrotli-dev \
libbz2-dev \
diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-groovy/Dockerfile
b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-groovy/Dockerfile
index 449d134..7b9cf7e 100644
--- a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-groovy/Dockerfile
+++ b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-groovy/Dockerfile
@@ -41,7 +41,6 @@ RUN \
git \
gtk-doc-tools \
libboost-filesystem-dev \
- libboost-regex-dev \
libboost-system-dev \
libbrotli-dev \
libbz2-dev \
diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-xenial/Dockerfile
b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-xenial/Dockerfile
index 9e38399..15ed0fe 100644
--- a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-xenial/Dockerfile
+++ b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-xenial/Dockerfile
@@ -43,7 +43,6 @@ RUN \
git \
gtk-doc-tools \
libboost-filesystem-dev \
- libboost-regex-dev \
libboost-system-dev \
libbrotli-dev \
libbz2-dev \
diff --git a/dev/tasks/linux-packages/apache-arrow/debian.ubuntu-xenial/control
b/dev/tasks/linux-packages/apache-arrow/debian.ubuntu-xenial/control
index 2486f21..a70aa13 100644
--- a/dev/tasks/linux-packages/apache-arrow/debian.ubuntu-xenial/control
+++ b/dev/tasks/linux-packages/apache-arrow/debian.ubuntu-xenial/control
@@ -11,7 +11,6 @@ Build-Depends:
gobject-introspection,
gtk-doc-tools,
libboost-filesystem-dev,
- libboost-regex-dev,
libboost-system-dev,
libbrotli-dev,
libbz2-dev,
diff --git a/dev/tasks/linux-packages/apache-arrow/debian/control.in
b/dev/tasks/linux-packages/apache-arrow/debian/control.in
index c2251a5..86eca72 100644
--- a/dev/tasks/linux-packages/apache-arrow/debian/control.in
+++ b/dev/tasks/linux-packages/apache-arrow/debian/control.in
@@ -9,7 +9,6 @@ Build-Depends:
gobject-introspection,
gtk-doc-tools,
libboost-filesystem-dev,
- libboost-regex-dev,
libboost-system-dev,
libbrotli-dev,
libbz2-dev,
diff --git a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in
b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in
index a69e10e..0e248d6 100644
--- a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in
+++ b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in
@@ -206,7 +206,6 @@ Summary: Runtime libraries for Apache Arrow C++
License: Apache-2.0
Requires: boost%{boost_version}-system
Requires: boost%{boost_version}-filesystem
-Requires: boost%{boost_version}-regex
Requires: brotli
Requires: gflags
Requires: glog