From d13e56922e802f6d0d0ad649d16bc2177e0478db Mon Sep 17 00:00:00 2001
From: ffacs <ffacs@ffacs520@gmail.com>
Date: Tue, 23 Apr 2024 10:33:24 +0800
Subject: [PATCH] ORC-1688: [C++] Do not access TZDB if there is no timestamp
 type

### What changes were proposed in this pull request?
1. Don't write writer's Time Zone name when there are no timestamp types.
2. Do not access TZDB if there is no timestamp type.

### Why are the changes needed?
This patch could accelerate reading by reducing unnecessary IO.

### How was this patch tested?
All UT passed.

### Was this patch authored or co-authored using generative AI tooling?
NO

Closes #1893 from ffacs/ORC-1688.

Authored-by: ffacs <ffacs@ffacs520@gmail.com>
Signed-off-by: Gang Wu <ustcwg@gmail.com>
---
 c++/src/Timezone.cc      | 71 ++++++++++++++++++++++++++++++++--------
 c++/test/TestTimezone.cc |  2 +-
 c++/test/TestWriter.cc   | 46 ++++++++++++++++++++++++++
 3 files changed, 105 insertions(+), 14 deletions(-)

diff --git a/c++/src/Timezone.cc b/c++/src/Timezone.cc
index 29db7f37cd..384f8ea99f 100644
--- a/c++/src/Timezone.cc
+++ b/c++/src/Timezone.cc
@@ -24,6 +24,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
+#include <atomic>
 #include <map>
 #include <sstream>
 
@@ -671,17 +672,8 @@ namespace orc {
     return dir;
   }
 
-  /**
-   * Get a timezone by absolute filename.
-   * Results are cached.
-   */
-  const Timezone& getTimezoneByFilename(const std::string& filename) {
-    // ORC-110
-    std::lock_guard<std::mutex> timezone_lock(timezone_mutex);
-    std::map<std::string, std::shared_ptr<Timezone> >::iterator itr = timezoneCache.find(filename);
-    if (itr != timezoneCache.end()) {
-      return *(itr->second).get();
-    }
+  static std::vector<unsigned char> loadTZDB(const std::string& filename) {
+    std::vector<unsigned char> buffer;
     if (!fileExists(filename.c_str())) {
       std::stringstream ss;
       ss << "Time zone file " << filename << " does not exist."
@@ -691,12 +683,65 @@ namespace orc {
     try {
       std::unique_ptr<InputStream> file = readFile(filename);
       size_t size = static_cast<size_t>(file->getLength());
-      std::vector<unsigned char> buffer(size);
+      buffer.resize(size);
       file->read(&buffer[0], size, 0);
-      timezoneCache[filename] = std::make_shared<TimezoneImpl>(filename, buffer);
     } catch (ParseError& err) {
       throw TimezoneError(err.what());
     }
+    return buffer;
+  }
+
+  class LazyTimezone : public Timezone {
+   private:
+    std::string filename_;
+    mutable std::unique_ptr<TimezoneImpl> impl_;
+    mutable std::once_flag initialized_;
+
+    TimezoneImpl* getImpl() const {
+      std::call_once(initialized_, [&]() {
+        auto buffer = loadTZDB(filename_);
+        impl_ = std::make_unique<TimezoneImpl>(filename_, std::move(buffer));
+      });
+      return impl_.get();
+    }
+
+   public:
+    LazyTimezone(const std::string& filename) : filename_(filename) {}
+
+    const TimezoneVariant& getVariant(int64_t clk) const override {
+      return getImpl()->getVariant(clk);
+    }
+    int64_t getEpoch() const override {
+      return getImpl()->getEpoch();
+    }
+    void print(std::ostream& os) const override {
+      return getImpl()->print(os);
+    }
+    uint64_t getVersion() const override {
+      return getImpl()->getVersion();
+    }
+
+    int64_t convertToUTC(int64_t clk) const override {
+      return getImpl()->convertToUTC(clk);
+    }
+
+    int64_t convertFromUTC(int64_t clk) const override {
+      return getImpl()->convertFromUTC(clk);
+    }
+  };
+
+  /**
+   * Get a timezone by absolute filename.
+   * Results are cached.
+   */
+  const Timezone& getTimezoneByFilename(const std::string& filename) {
+    // ORC-110
+    std::lock_guard<std::mutex> timezone_lock(timezone_mutex);
+    std::map<std::string, std::shared_ptr<Timezone> >::iterator itr = timezoneCache.find(filename);
+    if (itr != timezoneCache.end()) {
+      return *(itr->second).get();
+    }
+    timezoneCache[filename] = std::make_shared<LazyTimezone>(filename);
     return *timezoneCache[filename].get();
   }
 
diff --git a/c++/test/TestTimezone.cc b/c++/test/TestTimezone.cc
index 727d6a2763..94895cd700 100644
--- a/c++/test/TestTimezone.cc
+++ b/c++/test/TestTimezone.cc
@@ -431,7 +431,7 @@ namespace orc {
       ASSERT_TRUE(delEnv("TZDIR"));
     }
     ASSERT_TRUE(setEnv("TZDIR", "/path/to/wrong/tzdb"));
-    EXPECT_THAT([]() { getTimezoneByName("America/Los_Angeles"); },
+    EXPECT_THAT([]() { getTimezoneByName("America/Los_Angeles").getVersion(); },
                 testing::ThrowsMessage<TimezoneError>(testing::HasSubstr(
                     "Time zone file /path/to/wrong/tzdb/America/Los_Angeles does not exist."
                     " Please install IANA time zone database and set TZDIR env.")));
diff --git a/c++/test/TestWriter.cc b/c++/test/TestWriter.cc
index d160f82ff1..0d6b368c00 100644
--- a/c++/test/TestWriter.cc
+++ b/c++/test/TestWriter.cc
@@ -2201,6 +2201,52 @@ namespace orc {
                  std::invalid_argument);
   }
 
+  TEST_P(WriterTest, testLazyLoadTZDB) {
+    MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+    MemoryPool* pool = getDefaultPool();
+    std::unique_ptr<Type> type(Type::buildTypeFromString("struct<col1:int>"));
+
+    uint64_t stripeSize = 1024;            // 1K
+    uint64_t compressionBlockSize = 1024;  // 1k
+
+    std::unique_ptr<Writer> writer =
+        createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool,
+                     &memStream, fileVersion, 0, "/ERROR/TIMEZONE");
+    std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(10);
+    StructVectorBatch* structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
+    LongVectorBatch* longBatch = dynamic_cast<LongVectorBatch*>(structBatch->fields[0]);
+
+    for (uint64_t j = 0; j < 10; ++j) {
+      for (uint64_t i = 0; i < 10; ++i) {
+        longBatch->data[i] = static_cast<int64_t>(i);
+      }
+      structBatch->numElements = 10;
+      longBatch->numElements = 10;
+
+      writer->add(*batch);
+    }
+
+    writer->close();
+
+    auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
+    std::unique_ptr<Reader> reader = createReader(pool, std::move(inStream));
+    std::unique_ptr<RowReader> rowReader = createRowReader(reader.get(), "/ERROR/TIMEZONE");
+    EXPECT_EQ(100, reader->getNumberOfRows());
+
+    batch = rowReader->createRowBatch(10);
+    for (uint64_t j = 0; j < 10; ++j) {
+      EXPECT_TRUE(rowReader->next(*batch));
+      EXPECT_EQ(10, batch->numElements);
+
+      for (uint64_t i = 0; i < 10; ++i) {
+        structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
+        longBatch = dynamic_cast<LongVectorBatch*>(structBatch->fields[0]);
+        EXPECT_EQ(i, longBatch->data[i]);
+      }
+    }
+    EXPECT_FALSE(rowReader->next(*batch));
+  }
+
   INSTANTIATE_TEST_SUITE_P(OrcTest, WriterTest,
                            Values(FileVersion::v_0_11(), FileVersion::v_0_12(),
                                   FileVersion::UNSTABLE_PRE_2_0()));