Skip to content

Commit

Permalink
ORC-1688: [C++] Do not access TZDB if there is no timestamp type
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
1. Don't write writer's Time Zone name when there are no timestamp types.
2. Do not access TZDB if there is no timestamp type.

### Why are the changes needed?
This patch could accelerate reading by reducing unnecessary IO.

### How was this patch tested?
All UT passed.

### Was this patch authored or co-authored using generative AI tooling?
NO

Closes #1893 from ffacs/ORC-1688.

Authored-by: ffacs <ffacs@[email protected]>
Signed-off-by: Gang Wu <[email protected]>
  • Loading branch information
ffacs authored and wgtmac committed Apr 23, 2024
1 parent d4f13dc commit d13e569
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 14 deletions.
71 changes: 58 additions & 13 deletions c++/src/Timezone.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <atomic>
#include <map>
#include <sstream>

Expand Down Expand Up @@ -671,17 +672,8 @@ namespace orc {
return dir;
}

/**
* Get a timezone by absolute filename.
* Results are cached.
*/
const Timezone& getTimezoneByFilename(const std::string& filename) {
// ORC-110
std::lock_guard<std::mutex> timezone_lock(timezone_mutex);
std::map<std::string, std::shared_ptr<Timezone> >::iterator itr = timezoneCache.find(filename);
if (itr != timezoneCache.end()) {
return *(itr->second).get();
}
static std::vector<unsigned char> loadTZDB(const std::string& filename) {
std::vector<unsigned char> buffer;
if (!fileExists(filename.c_str())) {
std::stringstream ss;
ss << "Time zone file " << filename << " does not exist."
Expand All @@ -691,12 +683,65 @@ namespace orc {
try {
std::unique_ptr<InputStream> file = readFile(filename);
size_t size = static_cast<size_t>(file->getLength());
std::vector<unsigned char> buffer(size);
buffer.resize(size);
file->read(&buffer[0], size, 0);
timezoneCache[filename] = std::make_shared<TimezoneImpl>(filename, buffer);
} catch (ParseError& err) {
throw TimezoneError(err.what());
}
return buffer;
}

class LazyTimezone : public Timezone {
private:
std::string filename_;
mutable std::unique_ptr<TimezoneImpl> impl_;
mutable std::once_flag initialized_;

TimezoneImpl* getImpl() const {
std::call_once(initialized_, [&]() {
auto buffer = loadTZDB(filename_);
impl_ = std::make_unique<TimezoneImpl>(filename_, std::move(buffer));
});
return impl_.get();
}

public:
LazyTimezone(const std::string& filename) : filename_(filename) {}

const TimezoneVariant& getVariant(int64_t clk) const override {
return getImpl()->getVariant(clk);
}
int64_t getEpoch() const override {
return getImpl()->getEpoch();
}
void print(std::ostream& os) const override {
return getImpl()->print(os);
}
uint64_t getVersion() const override {
return getImpl()->getVersion();
}

int64_t convertToUTC(int64_t clk) const override {
return getImpl()->convertToUTC(clk);
}

int64_t convertFromUTC(int64_t clk) const override {
return getImpl()->convertFromUTC(clk);
}
};

/**
* Get a timezone by absolute filename.
* Results are cached.
*/
const Timezone& getTimezoneByFilename(const std::string& filename) {
// ORC-110
std::lock_guard<std::mutex> timezone_lock(timezone_mutex);
std::map<std::string, std::shared_ptr<Timezone> >::iterator itr = timezoneCache.find(filename);
if (itr != timezoneCache.end()) {
return *(itr->second).get();
}
timezoneCache[filename] = std::make_shared<LazyTimezone>(filename);
return *timezoneCache[filename].get();
}

Expand Down
2 changes: 1 addition & 1 deletion c++/test/TestTimezone.cc
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,7 @@ namespace orc {
ASSERT_TRUE(delEnv("TZDIR"));
}
ASSERT_TRUE(setEnv("TZDIR", "/path/to/wrong/tzdb"));
EXPECT_THAT([]() { getTimezoneByName("America/Los_Angeles"); },
EXPECT_THAT([]() { getTimezoneByName("America/Los_Angeles").getVersion(); },
testing::ThrowsMessage<TimezoneError>(testing::HasSubstr(
"Time zone file /path/to/wrong/tzdb/America/Los_Angeles does not exist."
" Please install IANA time zone database and set TZDIR env.")));
Expand Down
46 changes: 46 additions & 0 deletions c++/test/TestWriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2201,6 +2201,52 @@ namespace orc {
std::invalid_argument);
}

TEST_P(WriterTest, testLazyLoadTZDB) {
MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
MemoryPool* pool = getDefaultPool();
std::unique_ptr<Type> type(Type::buildTypeFromString("struct<col1:int>"));

uint64_t stripeSize = 1024; // 1K
uint64_t compressionBlockSize = 1024; // 1k

std::unique_ptr<Writer> writer =
createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool,
&memStream, fileVersion, 0, "/ERROR/TIMEZONE");
std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(10);
StructVectorBatch* structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
LongVectorBatch* longBatch = dynamic_cast<LongVectorBatch*>(structBatch->fields[0]);

for (uint64_t j = 0; j < 10; ++j) {
for (uint64_t i = 0; i < 10; ++i) {
longBatch->data[i] = static_cast<int64_t>(i);
}
structBatch->numElements = 10;
longBatch->numElements = 10;

writer->add(*batch);
}

writer->close();

auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
std::unique_ptr<Reader> reader = createReader(pool, std::move(inStream));
std::unique_ptr<RowReader> rowReader = createRowReader(reader.get(), "/ERROR/TIMEZONE");
EXPECT_EQ(100, reader->getNumberOfRows());

batch = rowReader->createRowBatch(10);
for (uint64_t j = 0; j < 10; ++j) {
EXPECT_TRUE(rowReader->next(*batch));
EXPECT_EQ(10, batch->numElements);

for (uint64_t i = 0; i < 10; ++i) {
structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
longBatch = dynamic_cast<LongVectorBatch*>(structBatch->fields[0]);
EXPECT_EQ(i, longBatch->data[i]);
}
}
EXPECT_FALSE(rowReader->next(*batch));
}

INSTANTIATE_TEST_SUITE_P(OrcTest, WriterTest,
Values(FileVersion::v_0_11(), FileVersion::v_0_12(),
FileVersion::UNSTABLE_PRE_2_0()));
Expand Down

0 comments on commit d13e569

Please sign in to comment.