pitrou commented on code in PR #40697:
URL: https://github.com/apache/arrow/pull/40697#discussion_r1537834693
##########
cpp/src/arrow/adapters/orc/adapter.cc:
##########
@@ -183,8 +184,40 @@ liborc::RowReaderOptions default_row_reader_options() {
return options;
}
+// Proactively check timezone database availability for ORC versions older
than 2.0.0
+Status check_timezone_database_availability() {
+ if (OrcVersion::Get().value_or(OrcVersion{0, 0, 0}).major >= 2) {
+ return Status::OK();
+ }
+ auto tz_dir = std::getenv("TZDIR");
+ bool is_tzdb_avaiable = tz_dir != nullptr
+ ? std::filesystem::exists(tz_dir)
+ : std::filesystem::exists("/usr/share/zoneinfo");
+ if (!is_tzdb_avaiable) {
+ return Status::Invalid(
+ "IANA time zone database is unavailable but required by ORC."
+ " Please install it to /usr/share/zoneinfo or set TZDIR env to the
installed"
+ " directory");
+ }
+ return Status::OK();
+}
+
} // namespace
+std::optional<OrcVersion> OrcVersion::Get() {
+ std::vector<int> versions;
+ std::stringstream ss{ORC_VERSION};
+ while (ss.good()) {
+ std::string str;
+ std::getline(ss, str, '.');
+ versions.emplace_back(std::stoi(str));
+ }
+ if (versions.size() != 3) {
Review Comment:
So if a future ORC versions advertises e.g. "2.1.0.1", this will fail?
Sounds inflexible while we're only concerned about the major number.
##########
cpp/src/arrow/adapters/orc/adapter.h:
##########
@@ -47,6 +48,16 @@ struct StripeInformation {
int64_t first_row_id;
};
+/// \brief Version provided by the official ORC C++ library.
+struct ARROW_EXPORT OrcVersion {
+ int major;
+ int minor;
+ int patch;
+
+ /// \brief Return the current version of ORC C++ library.
+ static std::optional<OrcVersion> Get();
Review Comment:
Assuming the ORC_VERSION macro always exists, shouldn't we ensure that this
always returns successfully?
##########
cpp/src/arrow/adapters/orc/adapter_test.cc:
##########
@@ -636,6 +638,35 @@ TEST(TestAdapterReadWrite, FieldAttributesRoundTrip) {
AssertSchemaEqual(schema, read_schema, /*check_metadata=*/true);
}
+TEST(TestAdapterReadWrite, ThrowWhenTZDBUnavaiable) {
+ auto orc_version = adapters::orc::OrcVersion::Get();
+ if (orc_version.has_value() && orc_version->major >= 2) {
+ GTEST_SKIP() << "Only ORC pre-2.0.0 versions have the time zone database
check";
+ }
+
+ // Backup the original TZDIR env and set a wrong value by purpose to trigger
the check.
+ const char* tzdir_env_key = "TZDIR";
+ const char* expect_str = "IANA time zone database is unavailable but
required by ORC";
+ auto tzdir_env_backup = std::getenv(tzdir_env_key);
+ ARROW_EXPECT_OK(arrow::internal::SetEnvVar(tzdir_env_key, "/a/b/c/d/e"));
Review Comment:
Please use `EnvVarGuard` so that the value doesn't leak if this test fails.
##########
cpp/src/arrow/adapters/orc/adapter.cc:
##########
@@ -183,8 +184,40 @@ liborc::RowReaderOptions default_row_reader_options() {
return options;
}
+// Proactively check timezone database availability for ORC versions older
than 2.0.0
+Status check_timezone_database_availability() {
+ if (OrcVersion::Get().value_or(OrcVersion{0, 0, 0}).major >= 2) {
+ return Status::OK();
+ }
+ auto tz_dir = std::getenv("TZDIR");
+ bool is_tzdb_avaiable = tz_dir != nullptr
+ ? std::filesystem::exists(tz_dir)
+ : std::filesystem::exists("/usr/share/zoneinfo");
+ if (!is_tzdb_avaiable) {
+ return Status::Invalid(
+ "IANA time zone database is unavailable but required by ORC."
+ " Please install it to /usr/share/zoneinfo or set TZDIR env to the
installed"
+ " directory");
+ }
+ return Status::OK();
+}
+
} // namespace
+std::optional<OrcVersion> OrcVersion::Get() {
+ std::vector<int> versions;
+ std::stringstream ss{ORC_VERSION};
+ while (ss.good()) {
+ std::string str;
+ std::getline(ss, str, '.');
+ versions.emplace_back(std::stoi(str));
+ }
+ if (versions.size() != 3) {
+ return std::nullopt;
+ }
+ return OrcVersion{versions[0], versions[1], versions[2]};
Review Comment:
I'm curious: ORC defines a `OrcVersion` class but doesn't provide any API to
query it?
##########
cpp/src/arrow/adapters/orc/adapter.cc:
##########
@@ -183,8 +184,40 @@ liborc::RowReaderOptions default_row_reader_options() {
return options;
}
+// Proactively check timezone database availability for ORC versions older
than 2.0.0
+Status check_timezone_database_availability() {
Review Comment:
Should use `CamelCase` for function names.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]