This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new 44bd3de94 ORC-1688: [C++] Do not access TZDB if there is no timestamp 
type
44bd3de94 is described below

commit 44bd3de94c5d686cf994a3af1447a5c89ee89b8d
Author: ffacs <ffacs@ffacs...@gmail.com>
AuthorDate: Tue Apr 23 10:33:24 2024 +0800

    ORC-1688: [C++] Do not access TZDB if there is no timestamp type
    
    ### What changes were proposed in this pull request?
    1. Don't write writer's Time Zone name when there are no timestamp types.
    2. Do not access TZDB if there is no timestamp type.
    
    ### Why are the changes needed?
    This patch could accelerate reading by reducing unnecessary IO.
    
    ### How was this patch tested?
    All UT passed.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    NO
    
    Closes #1893 from ffacs/ORC-1688.
    
    Authored-by: ffacs <ffacs@ffacs...@gmail.com>
    Signed-off-by: Gang Wu <ust...@gmail.com>
    (cherry picked from commit d13e56922e802f6d0d0ad649d16bc2177e0478db)
    Signed-off-by: Gang Wu <ust...@gmail.com>
---
 c++/src/Timezone.cc      | 71 +++++++++++++++++++++++++++++++++++++++---------
 c++/test/TestTimezone.cc |  2 +-
 c++/test/TestWriter.cc   | 46 +++++++++++++++++++++++++++++++
 3 files changed, 105 insertions(+), 14 deletions(-)

diff --git a/c++/src/Timezone.cc b/c++/src/Timezone.cc
index ebaf222fc..4c78a53a2 100644
--- a/c++/src/Timezone.cc
+++ b/c++/src/Timezone.cc
@@ -24,6 +24,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
+#include <atomic>
 #include <map>
 #include <sstream>
 
@@ -671,17 +672,8 @@ namespace orc {
     return dir;
   }
 
-  /**
-   * Get a timezone by absolute filename.
-   * Results are cached.
-   */
-  const Timezone& getTimezoneByFilename(const std::string& filename) {
-    // ORC-110
-    std::lock_guard<std::mutex> timezone_lock(timezone_mutex);
-    std::map<std::string, std::shared_ptr<Timezone> >::iterator itr = 
timezoneCache.find(filename);
-    if (itr != timezoneCache.end()) {
-      return *(itr->second).get();
-    }
+  static std::vector<unsigned char> loadTZDB(const std::string& filename) {
+    std::vector<unsigned char> buffer;
     if (!fileExists(filename.c_str())) {
       std::stringstream ss;
       ss << "Time zone file " << filename << " does not exist."
@@ -691,12 +683,65 @@ namespace orc {
     try {
       std::unique_ptr<InputStream> file = readFile(filename);
       size_t size = static_cast<size_t>(file->getLength());
-      std::vector<unsigned char> buffer(size);
+      buffer.resize(size);
       file->read(&buffer[0], size, 0);
-      timezoneCache[filename] = std::make_shared<TimezoneImpl>(filename, 
buffer);
     } catch (ParseError& err) {
       throw TimezoneError(err.what());
     }
+    return buffer;
+  }
+
+  class LazyTimezone : public Timezone {
+   private:
+    std::string filename_;
+    mutable std::unique_ptr<TimezoneImpl> impl_;
+    mutable std::once_flag initialized_;
+
+    TimezoneImpl* getImpl() const {
+      std::call_once(initialized_, [&]() {
+        auto buffer = loadTZDB(filename_);
+        impl_ = std::make_unique<TimezoneImpl>(filename_, std::move(buffer));
+      });
+      return impl_.get();
+    }
+
+   public:
+    LazyTimezone(const std::string& filename) : filename_(filename) {}
+
+    const TimezoneVariant& getVariant(int64_t clk) const override {
+      return getImpl()->getVariant(clk);
+    }
+    int64_t getEpoch() const override {
+      return getImpl()->getEpoch();
+    }
+    void print(std::ostream& os) const override {
+      return getImpl()->print(os);
+    }
+    uint64_t getVersion() const override {
+      return getImpl()->getVersion();
+    }
+
+    int64_t convertToUTC(int64_t clk) const override {
+      return getImpl()->convertToUTC(clk);
+    }
+
+    int64_t convertFromUTC(int64_t clk) const override {
+      return getImpl()->convertFromUTC(clk);
+    }
+  };
+
+  /**
+   * Get a timezone by absolute filename.
+   * Results are cached.
+   */
+  const Timezone& getTimezoneByFilename(const std::string& filename) {
+    // ORC-110
+    std::lock_guard<std::mutex> timezone_lock(timezone_mutex);
+    std::map<std::string, std::shared_ptr<Timezone> >::iterator itr = 
timezoneCache.find(filename);
+    if (itr != timezoneCache.end()) {
+      return *(itr->second).get();
+    }
+    timezoneCache[filename] = std::make_shared<LazyTimezone>(filename);
     return *timezoneCache[filename].get();
   }
 
diff --git a/c++/test/TestTimezone.cc b/c++/test/TestTimezone.cc
index 727d6a276..94895cd70 100644
--- a/c++/test/TestTimezone.cc
+++ b/c++/test/TestTimezone.cc
@@ -431,7 +431,7 @@ namespace orc {
       ASSERT_TRUE(delEnv("TZDIR"));
     }
     ASSERT_TRUE(setEnv("TZDIR", "/path/to/wrong/tzdb"));
-    EXPECT_THAT([]() { getTimezoneByName("America/Los_Angeles"); },
+    EXPECT_THAT([]() { getTimezoneByName("America/Los_Angeles").getVersion(); 
},
                 testing::ThrowsMessage<TimezoneError>(testing::HasSubstr(
                     "Time zone file /path/to/wrong/tzdb/America/Los_Angeles 
does not exist."
                     " Please install IANA time zone database and set TZDIR 
env.")));
diff --git a/c++/test/TestWriter.cc b/c++/test/TestWriter.cc
index d160f82ff..0d6b368c0 100644
--- a/c++/test/TestWriter.cc
+++ b/c++/test/TestWriter.cc
@@ -2201,6 +2201,52 @@ namespace orc {
                  std::invalid_argument);
   }
 
+  TEST_P(WriterTest, testLazyLoadTZDB) {
+    MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+    MemoryPool* pool = getDefaultPool();
+    std::unique_ptr<Type> type(Type::buildTypeFromString("struct<col1:int>"));
+
+    uint64_t stripeSize = 1024;            // 1K
+    uint64_t compressionBlockSize = 1024;  // 1k
+
+    std::unique_ptr<Writer> writer =
+        createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, 
*type, pool,
+                     &memStream, fileVersion, 0, "/ERROR/TIMEZONE");
+    std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(10);
+    StructVectorBatch* structBatch = 
dynamic_cast<StructVectorBatch*>(batch.get());
+    LongVectorBatch* longBatch = 
dynamic_cast<LongVectorBatch*>(structBatch->fields[0]);
+
+    for (uint64_t j = 0; j < 10; ++j) {
+      for (uint64_t i = 0; i < 10; ++i) {
+        longBatch->data[i] = static_cast<int64_t>(i);
+      }
+      structBatch->numElements = 10;
+      longBatch->numElements = 10;
+
+      writer->add(*batch);
+    }
+
+    writer->close();
+
+    auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), 
memStream.getLength());
+    std::unique_ptr<Reader> reader = createReader(pool, std::move(inStream));
+    std::unique_ptr<RowReader> rowReader = createRowReader(reader.get(), 
"/ERROR/TIMEZONE");
+    EXPECT_EQ(100, reader->getNumberOfRows());
+
+    batch = rowReader->createRowBatch(10);
+    for (uint64_t j = 0; j < 10; ++j) {
+      EXPECT_TRUE(rowReader->next(*batch));
+      EXPECT_EQ(10, batch->numElements);
+
+      for (uint64_t i = 0; i < 10; ++i) {
+        structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
+        longBatch = dynamic_cast<LongVectorBatch*>(structBatch->fields[0]);
+        EXPECT_EQ(i, longBatch->data[i]);
+      }
+    }
+    EXPECT_FALSE(rowReader->next(*batch));
+  }
+
   INSTANTIATE_TEST_SUITE_P(OrcTest, WriterTest,
                            Values(FileVersion::v_0_11(), FileVersion::v_0_12(),
                                   FileVersion::UNSTABLE_PRE_2_0()));

Reply via email to