MSD-1170: filter out empty buckets while calculating embeddings (#23)

lukaszkolodziejczyk · web-flow · commit 2e7585188093 · 2024-11-28T20:06:29.000+08:00
diff --git a/src/mostlyai/qa/report.py b/src/mostlyai/qa/report.py
@@ -160,7 +160,7 @@ def report(
             check_min_sample_size(syn_sample_size, 100, "synthetic")
             check_min_sample_size(trn_sample_size, 90, "training")
             if hol_tgt_data is not None:
-                check_min_sample_size(trn_sample_size, 10, "holdout")
+                check_min_sample_size(hol_sample_size, 10, "holdout")
         except PrerequisiteNotMetError as err:
             _LOG.info(err)
             statistics.mark_early_exit()
@@ -242,10 +242,12 @@ def _calc_pull_embeds(df_tgt: pd.DataFrame, df_ctx: pd.DataFrame, start: int, st
             )
             # split into buckets for calculating embeddings to avoid memory issues and report continuous progress
             buckets = np.array_split(strings, stop - start)
+            buckets = [b for b in buckets if len(b) > 0]
             embeds = []
             for i, bucket in enumerate(buckets, 1):
                 embeds += [calculate_embeddings(bucket.tolist())]
                 on_progress(current=start + i, total=100)
+            on_progress(current=stop, total=100)
             embeds = np.concatenate(embeds, axis=0)
             _LOG.info(f"calculated embeddings {embeds.shape}")
             return embeds
diff --git a/tests/end_to_end/test_report.py b/tests/end_to_end/test_report.py
@@ -193,22 +193,32 @@ def make_dfs(
 
     for test_idx, df_dict in enumerate(test_dfs):
         ctx_df, tgt_df = df_dict.pop("dfs")
-        syn_ctx_data = trn_ctx_data = val_ctx_data = ctx_df
-        syn_tgt_data = trn_tgt_data = val_tgt_data = tgt_df
+        syn_ctx_data = trn_ctx_data = hol_ctx_data = ctx_df
+        syn_tgt_data = trn_tgt_data = hol_tgt_data = tgt_df
         early_term = df_dict.pop("early_term")
         _, metrics = report(
             syn_tgt_data=syn_tgt_data,
             trn_tgt_data=trn_tgt_data,
-            hol_tgt_data=val_tgt_data,
+            hol_tgt_data=hol_tgt_data,
             syn_ctx_data=syn_ctx_data,
             trn_ctx_data=trn_ctx_data,
-            hol_ctx_data=val_ctx_data,
+            hol_ctx_data=hol_ctx_data,
             tgt_context_key="ck",
             ctx_primary_key="pk",
         )
         assert metrics is None if early_term else metrics is not None, f"Test {test_idx} failed"
 
 
+def test_report_few_holdout_records(tmp_path):
+    tgt = pd.DataFrame({"id": list(range(100)), "col": ["a"] * 100})
+    _, metrics = report(
+        syn_tgt_data=tgt,
+        trn_tgt_data=tgt,
+        hol_tgt_data=tgt[:10],
+    )
+    assert metrics is not None
+
+
 def test_report_sequential_few_records(tmp_path):
     # ensure that we don't crash in case of dominant zero-seq-length
     ctx = pd.DataFrame({"id": list(range(1000))})