Lazy load current records

ghukill · ghukill · commit fb816839a95d · 2025-10-17T13:48:54.000-04:00
Why these changes are being introduced: Current TIMDEX records have been a consistent source of complexity and performance concerns. They are also one of the defining features of TDA, so it's worth getting right. The previous approach was to materialize lightweight metadata records about all current versions of each record in memory as a DuckDB temp table. This made repeated read queries pulling only current records more efficient, but was unnecessarily loading that data into memory for operations like writing data or reading a specific run (not associated with current records). It turns out that reading current records is somewhat rare, and when it does happen, it's usually a one-off request as part of a larger operation like re-indexing a source in TIM. How this addresses that need: The new approach is a hybrid between a view only (lazy evaluation) and a temporary table in memory (eager evaluation). By default, a view is created, which either a) does not get used or b) is used but only 1-2 times per session, in which case the lazy evaluation of a view is okay. Alternatively, TIMDEXDataset can be initialized with 'preload_current_records=True' if it's known that multiple requests for current records will be needed in the session and it's worth the time and memory hit upfront. Side effects of this change: * For most operations in the TIMDEX ETL pipeline, which don't use current records, load time and memory usage are fairly dramatically decreased. Relevant ticket(s): - https://mitlibraries.atlassian.net/browse/USE-58 - prep work for new methods
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
@@ -259,3 +259,14 @@ def test_dataset_duckdb_context_creates_data_schema(timdex_dataset):
         ).fetchone()[0]
         == 1
     )
+
+
+def test_dataset_preload_current_records_default_false(timdex_dataset):
+    assert timdex_dataset.preload_current_records is False
+    assert timdex_dataset.metadata.preload_current_records is False
+
+
+def test_dataset_preload_current_records_flag_true(tmp_path):
+    td = TIMDEXDataset(str(tmp_path), preload_current_records=True)
+    assert td.preload_current_records is True
+    assert td.metadata.preload_current_records is True
diff --git a/tests/test_metadata.py b/tests/test_metadata.py
@@ -6,7 +6,7 @@
 
 from duckdb import DuckDBPyConnection
 
-from timdex_dataset_api import TIMDEXDatasetMetadata
+from timdex_dataset_api import TIMDEXDataset, TIMDEXDatasetMetadata
 
 ORDERED_METADATA_COLUMN_NAMES = [
     "timdex_record_id",
@@ -388,3 +388,51 @@ def test_tdm_prepare_duckdb_secret_and_extensions_home_env_var_set_but_empty(
     )
     assert df.secret_directory == "/tmp/.duckdb/secrets"
     assert df.extension_directory == "/tmp/.duckdb/extensions"
+
+
+def test_tdm_preload_current_records_default_false(tmp_path):
+    tdm = TIMDEXDatasetMetadata(str(tmp_path))
+    assert tdm.preload_current_records is False
+
+
+def test_tdm_preload_current_records_flag_true(tmp_path):
+    tdm = TIMDEXDatasetMetadata(str(tmp_path), preload_current_records=True)
+    assert tdm.preload_current_records is True
+
+
+def test_tdm_preload_false_no_temp_table(timdex_dataset_with_runs):
+    # instantiate TIMDEXDataset without preloading current records (default)
+    td = TIMDEXDataset(timdex_dataset_with_runs.location)
+
+    # assert that materialized, temporary table "temp.current_records" does not exist
+    temp_table_count = td.metadata.conn.query(
+        """
+        select count(*)
+        from information_schema.tables
+        where table_catalog = 'temp'
+        and table_name = 'current_records'
+        and table_type = 'LOCAL TEMPORARY'
+        ;
+        """
+    ).fetchone()[0]
+
+    assert temp_table_count == 0
+
+
+def test_tdm_preload_true_has_temp_table(timdex_dataset_with_runs):
+    # instantiate TIMDEXDataset with preloading current records
+    td = TIMDEXDataset(timdex_dataset_with_runs.location, preload_current_records=True)
+
+    # assert that materialized, temporary table "temp.current_records" does exist
+    temp_table_count = td.metadata.conn.query(
+        """
+            select count(*)
+            from information_schema.tables
+            where table_catalog = 'temp'
+            and table_name = 'current_records'
+            and table_type = 'LOCAL TEMPORARY'
+            ;
+            """
+    ).fetchone()[0]
+
+    assert temp_table_count == 1
diff --git a/timdex_dataset_api/dataset.py b/timdex_dataset_api/dataset.py
@@ -107,15 +107,22 @@ class TIMDEXDataset:
     def __init__(
         self,
         location: str,
+        *,
         config: TIMDEXDatasetConfig | None = None,
+        preload_current_records: bool = False,
     ):
         """Initialize TIMDEXDataset object.
 
         Args:
-            location (str ): Local filesystem path or an S3 URI to a parquet dataset.
+            location: Local filesystem path or an S3 URI to a parquet dataset.
+            config: Optional TIMDEXDatasetConfig instance.
+            preload_current_records: if True, create in-memory temp table for
+                current_records (faster for repeated queries); if False, create view only
+                (default, lower memory)
         """
         self.config = config or TIMDEXDatasetConfig()
         self.location = location
+        self.preload_current_records = preload_current_records
 
         self.create_data_structure()
 
@@ -125,7 +132,10 @@ def __init__(
         self.dataset = self.load_pyarrow_dataset()
 
         # dataset metadata
-        self.metadata = TIMDEXDatasetMetadata(location)
+        self.metadata = TIMDEXDatasetMetadata(
+            location,
+            preload_current_records=preload_current_records,
+        )
 
         # DuckDB context
         self.conn = self.setup_duckdb_context()
@@ -145,7 +155,11 @@ def data_records_root(self) -> str:
 
     def refresh(self) -> None:
         """Fully reload TIMDEXDataset instance."""
-        self.__init__(self.location)  # type: ignore[misc]
+        self.__init__(  # type: ignore[misc]
+            self.location,
+            config=self.config,
+            preload_current_records=self.preload_current_records,
+        )
 
     def create_data_structure(self) -> None:
         """Ensure ETL records data structure exists in TIMDEX dataset."""
diff --git a/timdex_dataset_api/metadata.py b/timdex_dataset_api/metadata.py
@@ -60,14 +60,20 @@ class TIMDEXDatasetMetadata:
     def __init__(
         self,
         location: str,
+        *,
+        preload_current_records: bool = False,
     ) -> None:
         """Init TIMDEXDatasetMetadata.
 
         Args:
             location: root location of TIMDEX dataset, e.g. 's3://timdex/dataset'
+            preload_current_records: if True, create in-memory temp table for
+                current_records (faster for repeated queries); if False, create view only
+                (default, lower memory)
         """
         self.location = location
         self.config = TIMDEXDatasetMetadataConfig()
+        self.preload_current_records = preload_current_records
 
         self.create_metadata_structure()
         self.conn: DuckDBPyConnection = self.setup_duckdb_context()
@@ -444,26 +450,20 @@ def _create_current_records_view(self, conn: DuckDBPyConnection) -> None:
         dataset.  With the metadata provided from this view, we can streamline data
         retrievals in TIMDEXDataset read methods.
 
-        For performance reasons, the final view reads from a DuckDB temporary table that
-        is constructed, "temp.main.current_records".  Because our connection is in memory,
-        the data in this temporary table is mostly in memory but has the ability to spill
-        to disk if we risk getting too close to our memory constraints.  We explicitly
-        set the temporary location on disk for DuckDB at "/tmp" to play nice with contexts
-        like AWS ECS or Lambda, where sometimes the $HOME env var is missing; DuckDB
-        often tries to utilize the user's home directory and this works around that.
+        By default, creates a view only (lazy evaluation). If
+        preload_current_records=True, creates a temp table for better performance
+        for repeated queries.
+
+        For temp table mode, the data is mostly in memory but has the ability to spill to
+        disk if we risk getting too close to our memory constraints. We explicitly set the
+        temporary location on disk for DuckDB at "/tmp" to play nice with contexts like
+        AWS ECS or Lambda, where sometimes the $HOME env var is missing; DuckDB often
+        tries to utilize the user's home directory and this works around that.
         """
         logger.info("creating view of current records metadata")
 
-        conn.execute(
-            """
-            set temp_directory = '/tmp';
-            """
-        )
-
-        conn.execute(
-            """
-            -- create temp table with current records using CTEs
-            create or replace temp table temp.main.current_records as
+        # SQL for the current records logic (CTEs)
+        current_records_query = """
             with
                 -- CTE of run_timestamp for last source full run
                 cr_source_last_full as (
@@ -502,13 +502,31 @@ def _create_current_records_view(self, conn: DuckDBPyConnection) -> None:
             select
                 * exclude (rn)
             from cr_ranked_records
-            where rn = 1;
+            where rn = 1
+        """
 
-            -- create view in metadata schema
-            create or replace view metadata.current_records as
-            select * from temp.main.current_records;
-            """
-        )
+        # create temp table (materializes in memory)
+        if self.preload_current_records:
+            conn.execute("set temp_directory = '/tmp';")
+            conn.execute(
+                f"""
+                create or replace temp table temp.main.current_records as
+                {current_records_query};
+
+                -- create view in metadata schema that points to temp table
+                create or replace view metadata.current_records as
+                select * from temp.main.current_records;
+                """
+            )
+
+        # create view only (lazy evaluation)
+        else:
+            conn.execute(
+                f"""
+                create or replace view metadata.current_records as
+                {current_records_query};
+                """
+            )
 
     def merge_append_deltas(self) -> None:
         """Merge append deltas into the static metadata database file."""