From 142afd106ebf2e2e4aad02144be6c6acd296dc92 Mon Sep 17 00:00:00 2001 From: atl-ggregson <147005258+atl-ggregson@users.noreply.github.com> Date: Thu, 16 Jan 2025 14:44:59 +1100 Subject: [PATCH] perf: Cache CSV stream schema (#363) The stream's `schema` property is accessed multiple times for each record (see `Stream._generate_record_messages()` for instance). Since the schema should be static this change caches it, resulting in a significant performance improvement. Testing with a sample 2,000,000 row dataset (`people-2000000` from https://github.com/datablist/sample-csv-files) reduced the read time from 441 seconds to 48 seconds; about a 10x improvement in throughput. --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- tap_csv/client.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5aed6b9..fdadda8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "tap-csv" -version = "1.1.0" +version = "1.2.0" description = "Singer tap for CSV, built with the Meltano SDK for Singer Taps." authors = ["Pat Nadolny"] keywords = [ diff --git a/tap_csv/client.py b/tap_csv/client.py index cae3892..f4b3739 100644 --- a/tap_csv/client.py +++ b/tap_csv/client.py @@ -6,6 +6,7 @@ import os import typing as t from datetime import datetime, timezone +from functools import cached_property from singer_sdk import typing as th from singer_sdk.streams import Stream @@ -121,12 +122,14 @@ def get_rows(self, file_path: str) -> t.Iterable[list]: with open(file_path, encoding=encoding) as f: yield from csv.reader(f, dialect="tap_dialect") - @property + @cached_property def schema(self) -> dict: """Return dictionary of record schema. Dynamically detect the json schema for the stream. - This is evaluated prior to any records being retrieved. + + This property is accessed multiple times for each record + so it's important to cache the result. """ properties: list[th.Property] = [] self.primary_keys = self.file_config.get("keys", [])