From ddc9b1b1a7d200216d0e5cce82387a6ae1003b2c Mon Sep 17 00:00:00 2001
From: Zhengyao Jiang <jzyjiangzhengyao@gmail.com>
Date: Thu, 23 Apr 2026 15:25:39 +0100
Subject: [PATCH 1/8] examples: add fraud-detection example (IEEE-CIS)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Self-contained reproduction of Weco's fraud-detection case study. Downloads
the Kaggle dataset, builds a leakage-safe 100K/25K time-based parquet split,
and exposes train.py as the optimization target (feature engineering +
LightGBM config both modifiable). evaluate.py prints auc_roc for Weco.

instructions.md is the full EDA + techniques prompt from the case study —
column semantics for each feature group (TransactionAmt, C/D/M/V), 10
well-known IEEE-CIS techniques (UID construction, target encoding with OOF,
velocity features, frequency encoding), and a target-leakage guardrail
pointing out the isFraud-in-df aggregation trap.

README walks through Kaggle API setup, prepare_data step, baseline sanity
check (~0.914 AUC), and the canonical weco run command
(gemini-3.1-pro-preview, 50 steps, expected trajectory into 0.928-0.933).
Also adds 'things to try' (no-instructions variance blow-up, EDA-only
ablation, scope restriction) and a silent-target-leakage watch-out pointing
to the published case study.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/README.md                        |  29 ++++-
 examples/fraud-detection/.gitignore       |   4 +
 examples/fraud-detection/README.md        | 145 ++++++++++++++++++++++
 examples/fraud-detection/evaluate.py      |  35 ++++++
 examples/fraud-detection/instructions.md  |  99 +++++++++++++++
 examples/fraud-detection/prepare_data.py  | 130 +++++++++++++++++++
 examples/fraud-detection/requirements.txt |   7 ++
 examples/fraud-detection/train.py         | 122 ++++++++++++++++++
 8 files changed, 570 insertions(+), 1 deletion(-)
 create mode 100644 examples/fraud-detection/.gitignore
 create mode 100644 examples/fraud-detection/README.md
 create mode 100644 examples/fraud-detection/evaluate.py
 create mode 100644 examples/fraud-detection/instructions.md
 create mode 100644 examples/fraud-detection/prepare_data.py
 create mode 100644 examples/fraud-detection/requirements.txt
 create mode 100644 examples/fraud-detection/train.py

diff --git a/examples/README.md b/examples/README.md
index 8b91d5b..dbfa4b8 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -16,6 +16,7 @@ Explore runnable examples that show how to use Weco to optimize ML models, promp
   - [🧠 Prompt Engineering](#-prompt-engineering)
   - [📊 Extract Line Plot — Chart to CSV](#-extract-line-plot--chart-to-csv)
   - [🛰️ Model Development — Spaceship Titanic](#️-model-development--spaceship-titanic)
+  - [🕵️ Fraud Detection — IEEE-CIS](#️-fraud-detection--ieee-cis)
 
 ### Prerequisites
 
@@ -35,6 +36,7 @@ pip install weco
 | 🧠 Prompt Engineering | Iteratively refine LLM prompts to improve accuracy | `openai`, `datasets`, OpenAI API key | [README](prompt/README.md) |
 | 📊 Agentic Scaffolding | Optimize agentic scaffolding for chart-to-CSV extraction | `openai`, `huggingface_hub`, `uv`, OpenAI API key | [README](extract-line-plot/README.md) |
 | 🛰️ Spaceship Titanic | Improve a Kaggle model training pipeline | `pandas`, `numpy`, `scikit-learn`, `torch`, `xgboost`, `lightgbm`, `catboost` | [README](spaceship-titanic/README.md) |
+| 🕵️ Fraud Detection | Optimize a fraud pipeline on IEEE-CIS (real Vesta transactions) | `pandas`, `numpy`, `scikit-learn`, `lightgbm`, `pyarrow`, `kaggle` | [README](fraud-detection/README.md) |
 
 ---
 
@@ -162,8 +164,33 @@ weco run --source train.py \
      --log-dir .runs/spaceship-titanic
 ```
 
+### 🕵️ Fraud Detection — IEEE-CIS
+
+Optimize a tabular fraud-detection pipeline on real Vesta payment data.
+Reproduces Weco's
+[fraud-detection case study](https://weco.ai/blog/framing-the-problem)
+(baseline AUC 0.914 → pooled 6-seed mean 0.9305 ± 0.0035 with full
+instructions at 200 steps).
+
+- **Prereqs**: Kaggle API token + [join the competition](https://www.kaggle.com/c/ieee-fraud-detection)
+- **Install Dependencies**: `pip install -r requirements.txt`
+- **Prepare data** (once, ~2-3 min): `python prepare_data.py`
+- **Run**:
+```bash
+cd examples/fraud-detection
+weco run --source train.py \
+     --eval-command "python evaluate.py" \
+     --metric auc_roc \
+     --goal maximize \
+     --steps 50 \
+     --model gemini-3.1-pro-preview \
+     --additional-instructions instructions.md \
+     --eval-timeout 300 \
+     --log-dir .runs/fraud-detection
+```
+
 ---
 
-If you're new to Weco, start with **Hello World**, then try **LangSmith ZephHR QA** for a realistic LangSmith optimization workflow, explore **Triton** and **CUDA** for kernel engineering, **Prompt Engineering** for optimzing an LLM's prompt, **Extract Line Plot** for optimzing agentic scaffolds, or **Spaceship Titanic** for model development.
+If you're new to Weco, start with **Hello World**, then try **LangSmith ZephHR QA** for a realistic LangSmith optimization workflow, explore **Triton** and **CUDA** for kernel engineering, **Prompt Engineering** for optimizing an LLM's prompt, **Extract Line Plot** for optimizing agentic scaffolds, **Spaceship Titanic** for model development, or **Fraud Detection** for a production-scale tabular ML case study.
 
 
diff --git a/examples/fraud-detection/.gitignore b/examples/fraud-detection/.gitignore
new file mode 100644
index 0000000..60f2536
--- /dev/null
+++ b/examples/fraud-detection/.gitignore
@@ -0,0 +1,4 @@
+data/
+.runs/
+__pycache__/
+*.pyc
diff --git a/examples/fraud-detection/README.md b/examples/fraud-detection/README.md
new file mode 100644
index 0000000..be05c22
--- /dev/null
+++ b/examples/fraud-detection/README.md
@@ -0,0 +1,145 @@
+# Fraud Detection (IEEE-CIS)
+
+Optimize a tabular fraud-detection pipeline on the
+[IEEE-CIS Fraud Detection](https://www.kaggle.com/c/ieee-fraud-detection) Kaggle
+dataset (real Vesta payment transactions). Weco rewrites `train.py` — both
+feature engineering and the LightGBM configuration — to maximize AUC-ROC on a
+held-out, time-based validation split.
+
+This example reproduces the setup from Weco's fraud-detection case study
+([blog post](https://weco.ai/blog/framing-the-problem),
+[code](https://github.com/WecoAI/fraud-detection-case-study)). Expected
+improvement: **baseline ≈ 0.914 → full-pipeline pooled mean 0.9305 ± 0.0035**
+after 200 steps with `gemini-3.1-pro-preview` and the instructions in
+`instructions.md`.
+
+## Prerequisites
+
+1. **Kaggle API token**. Put a valid `kaggle.json` at `~/.kaggle/kaggle.json`
+   (see [Kaggle API credentials](https://github.com/Kaggle/kaggle-api#api-credentials)).
+2. **Join the competition**. Visit
+   <https://www.kaggle.com/c/ieee-fraud-detection> and accept the rules, or
+   the download will 403.
+3. **Weco API key** (free tier is fine). See the
+   [Weco docs](https://docs.weco.ai).
+
+## Setup
+
+```bash
+cd examples/fraud-detection
+pip install -r requirements.txt
+
+# Downloads ~120MB of CSVs, builds a small 100K/25K parquet split.
+# Time-based split: last 20% of transactions by TransactionDT = validation.
+# ~2-3 minutes on a modern laptop.
+python prepare_data.py
+```
+
+After this you should have:
+
+```
+data/
+  train_transaction.csv, train_identity.csv, test_*.csv  # raw
+  base_train_small.parquet   # 100K rows, time-ordered
+  base_val_small.parquet     # 25K rows, later in time
+```
+
+## Quick sanity check
+
+Run the baseline once to confirm everything loads:
+
+```bash
+python evaluate.py
+# → auc_roc: 0.914xxx   (takes ~30s)
+```
+
+If you see an AUC in the 0.91-0.92 range, you're ready.
+
+## Run Weco
+
+The "default" run uses the full EDA + techniques instructions (recommended —
+they contain the column semantics and known-good techniques for this dataset):
+
+```bash
+weco run --source train.py \
+     --eval-command "python evaluate.py" \
+     --metric auc_roc \
+     --goal maximize \
+     --steps 50 \
+     --model gemini-3.1-pro-preview \
+     --additional-instructions instructions.md \
+     --eval-timeout 300 \
+     --log-dir .runs/fraud-detection
+```
+
+Expected trajectory:
+
+- Steps 1–10: Weco explores — tries log-amount, simple aggregations, category
+  encodings. AUC moves into 0.918-0.925.
+- Steps 10–50: builds UID-style features (card1 + addr1 + account-creation
+  estimate via `D1`), target encoding with out-of-fold protection, velocity
+  features. AUC climbs to 0.928-0.933.
+- Beyond step 50: diminishing returns; the pooled mean across 6 seeds in our
+  case study was 0.9305 ± 0.0035.
+
+## Explanation
+
+- `--source train.py` — the file Weco rewrites. Both `build_features` and
+  `train_and_evaluate` are fair game.
+- `--eval-command "python evaluate.py"` — called after every proposed edit;
+  reimports `train.py`, runs the pipeline, prints `auc_roc: 0.xxxxxx`. Weco
+  parses the last line matching `--metric`.
+- `--metric auc_roc --goal maximize` — Weco optimizes the metric printed by
+  the evaluator.
+- `--additional-instructions instructions.md` — injects domain context into
+  every optimization step. **This is what mostly matters.** See the
+  case study: EDA-level instructions (what each column means in this
+  specific dataset) drive most of the gain. Kaggle-classic techniques are
+  typically already in the LLM's pretraining distribution. Feed the optimizer
+  what it couldn't already know — dataset-specific semantics, proprietary
+  heuristics, internal constraints.
+- `--eval-timeout 300` — one eval takes ~30-60s; 300s gives headroom for
+  feature-heavy proposals.
+
+## Things to try
+
+1. **No instructions baseline**: remove `--additional-instructions` and watch
+   variance across seeds balloon (std ~0.008 vs ~0.002 with instructions).
+   Also watch for silently-leaky proposals (see below).
+2. **EDA only**: keep only the column-meaning section of `instructions.md` —
+   the case study found this accounts for most of the mean gain.
+3. **Scope restriction**: point Weco at `train.py`'s `build_features` only by
+   editing the file to expose just that function (or split the pipeline into
+   `features.py` + `model.py`). In our case study, features-only delivered
+   most of the improvement that full-pipeline did.
+
+## Watch out for silent target leakage
+
+IEEE-CIS is a known trap for automated optimizers. A plausible idea like
+"count how many columns are zero per row" becomes leaky if the dataframe
+still contains `isFraud`, because fraud rows contribute a different count
+than non-fraud rows. The `build_features` in `train.py` drops `isFraud` and
+`TransactionID` before any cross-column aggregation — don't let proposals
+reintroduce aggregations on a dataframe that still contains the label.
+
+Signs to check for when a run reports a surprisingly high AUC (> 0.95 on this
+subsample):
+
+- Any `df.sum`/`df.mean`/`(df == x)` across all columns before the label is
+  dropped.
+- Target encoding without out-of-fold protection (encoder fit on train + val
+  concat).
+- Features computed using validation data (time-leakage: using `val_df` in
+  `train`'s feature-engineering step).
+
+The case study walks through a real instance where an uninstructed run
+reported AUC 0.9591 that dropped to 0.9154 after a one-line fix. See
+<https://weco.ai/blog/framing-the-problem>.
+
+## Citing the case study
+
+If you use this example, the underlying numbers come from
+<https://github.com/WecoAI/fraud-detection-case-study>. Setup: 200 steps,
+3 seeds per condition (6 for the Full pipeline + Full-instructions condition,
+pooled since the two ablations share that configuration),
+`gemini-3.1-pro-preview`.
diff --git a/examples/fraud-detection/evaluate.py b/examples/fraud-detection/evaluate.py
new file mode 100644
index 0000000..d3ad2d6
--- /dev/null
+++ b/examples/fraud-detection/evaluate.py
@@ -0,0 +1,35 @@
+"""Evaluator Weco calls after each proposed edit.
+
+Loads train.py fresh each run (Weco rewrites it in place), executes the
+pipeline, and prints a single `auc_roc: 0.xxxxxx` line that Weco parses as
+the metric.
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import sys
+from pathlib import Path
+
+
+def load_module(path: str):  # returns the freshly executed module object
+    spec = importlib.util.spec_from_file_location("train_under_test", path)
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)  # runs the file's top-level code; __name__ is "train_under_test", not "__main__"
+    return mod
+
+
+def main() -> int:  # returns the process exit status: 0 on success, 1 on an out-of-range AUC
+    train = load_module(str(Path(__file__).parent / "train.py"))  # train.py is resolved next to this file
+    auc = train.run_pipeline()
+
+    if not (0.0 <= auc <= 1.0):  # also rejects NaN, because every comparison with NaN is False
+        print(f"Constraint violated: AUC-ROC out of range ({auc})")
+        return 1
+
+    print(f"auc_roc: {auc:.6f}")  # the single metric line described in the module docstring
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/examples/fraud-detection/instructions.md b/examples/fraud-detection/instructions.md
new file mode 100644
index 0000000..cfef410
--- /dev/null
+++ b/examples/fraud-detection/instructions.md
@@ -0,0 +1,99 @@
+# Fraud Detection Optimization Instructions
+
+## Task
+Optimize `train.py` to maximize AUC-ROC for fraud detection on the IEEE-CIS dataset. You may modify both `build_features` (feature engineering) and `train_and_evaluate` (model config). Keep `run_pipeline`'s interface and the `auc_roc: 0.xxxxxx` print format unchanged so the evaluator can parse the metric.
+
+## Dataset Details
+- 100K train / 25K val, 3.5% fraud rate, time-based split
+- Base data has 297 columns after V-feature correlation pruning
+- Categoricals are already label-encoded as integers
+- TransactionDT is in seconds (timedelta from reference date, NOT a timestamp)
+
+## Column Meanings (from Kaggle community reverse-engineering)
+
+### Raw columns
+- **TransactionAmt**: USD amount. Heavy-tailed (median $68, max $4578). Log transform essential.
+- **ProductCD**: Product type (5 categories: C, H, R, S, W). Each has a distinct V-feature NaN pattern and fraud rate (C=11%, W=2.1%).
+- **card1**: Bank Identification Number (BIN) — first 6 digits of card. Top-3 importance.
+- **card2**: Additional card info. 1.5% NaN. Top-3 importance.
+- **card3/card5**: Card country/product type codes.
+- **card4**: Card network (visa, mastercard, etc).
+- **card6**: Card type (credit, debit).
+- **addr1**: Billing zip code (anonymized). 11.5% NaN.
+- **addr2**: Billing country.
+- **P_emaildomain**: Purchaser email domain (gmail.com, yahoo.com, etc).
+- **R_emaildomain**: Recipient email domain. Mismatch between P and R = fraud signal.
+- **dist1/dist2**: Distance features.
+
+### C-features (C1-C14): Entity occurrence COUNTS, no NaN
+- **C1** (importance rank #2): Count of addresses associated with the payment card
+- **C2**: Count of cards at the billing address
+- **C5**: Count of email addresses seen with this card
+- **C11**: Count of cards associated with a user identity
+- **C12**: Count of addresses associated with a user identity
+- **C13** (importance rank #4): Count of distinct email domains per entity — **one of the single most predictive raw features**. High values = fraud ring.
+- **C14** (importance rank #3): Related count feature
+
+### D-features (D1-D15): TIMEDELTA in days between events
+- **D1** (0.2% NaN, median 1 day): Days since last transaction. Most important D-feature. `TransactionDT/86400 - D1` estimates the **account creation date** — this is the key insight for UID construction.
+- **D2** (49% NaN, median 97 days): Days since card was first associated with the identity
+- **D3** (46% NaN): Days since last similar transaction
+- **D4** (29.5% NaN): Days since email association
+- **D10** (14% NaN): Days since last device-linked transaction
+- **D11** (52% NaN): Days since account was opened / account age
+- **D15** (16.5% NaN, median 46 days): Days since last transaction (alternative)
+- D-feature NaN rates themselves are informative — missingness patterns encode transaction type
+
+### M-features (M1-M9): Binary MATCH indicators
+Whether certain attributes match each other (name↔address, card↔billing, device↔historical, etc). Sum of True values, count of NaN, and the M-vector signature are all useful.
+
+### V-features (V1-V339, ~202 after pruning): Vesta-engineered risk signals
+Grouped by ProductCD — each product type uses a different subset of V-features (others are NaN). V258 is the #1 most important feature overall (gain=16703). Other important V-features: V283, V69, V130, V307, V294, V201.
+
+## Top Winning Techniques (from 1st-3rd place solutions)
+
+### 1. UID Construction (THE most impactful single technique)
+```python
+D1_start = floor(TransactionDT / 86400 - D1)  # estimated account creation day
+uid = card1 + "_" + addr1 + "_" + D1_start
+```
+This creates a stable user fingerprint. All aggregation features should be computed on this UID.
+
+### 2. UID-level aggregation features
+For each UID, compute: mean, std, count of TransactionAmt. Then z-score and ratio for each transaction relative to user's history. This captures "is this transaction unusual for this user?"
+
+### 3. Temporal centroid distance
+Compute the user's typical time-of-day using cyclical hour_sin/hour_cos means. The Euclidean distance of the current transaction from the centroid = "is this at an unusual time for this user?"
+
+### 4. D-feature lifecycle lags
+D1 - D2, D1 - D4, D1 - D10, D1 - D15: Inconsistencies between these timestamps indicate synthetic identities or account takeovers.
+
+### 5. Velocity features (sort by [uid, TransactionDT])
+Time since last transaction per user. Amount change from previous transaction. High velocity + high amount = fraud signal.
+
+### 6. Cross-entity cardinality (nunique)
+How many unique addr1 values per card1? How many unique card1 per addr1? How many unique P_emaildomain per uid? High cardinality = suspicious.
+
+### 7. NaN pattern signature
+The binary NaN/not-NaN pattern across D+M columns encodes the transaction type. Compute a bitwise signature or just count NaN per feature group.
+
+### 8. Frequency encoding
+For card1, card2, addr1, P_emaildomain, etc. — map each value to its frequency. Rare values (appearing once or twice) are fraud signals.
+
+### 9. Interaction features
+- amount_zscore × time_distance (unusual amount at unusual time)
+- amount_zscore × C1_ratio (unusual amount with unusual address count)
+- amount / (D1 + 1) = spending rate per day since last transaction
+
+### 10. Row-wise missingness features
+Count of NaN values across D-columns, M-columns, V-columns per row. Sum/mean of M-column values. The NaN pattern encodes the transaction profile.
+
+## Important Constraints
+- Keep code under 300 lines (Weco backend limit)
+- Use n_jobs=4 for any model operations
+- `train.py` loads `data/base_train_small.parquet` and `data/base_val_small.parquet` — don't change these paths
+- Categoricals are already integer-encoded — treat them as numeric
+- Keep the `run_pipeline() -> float` function signature and the `auc_roc: 0.xxxxxx` print format intact
+
+## Avoiding silent target leakage
+`isFraud` is the label. If you compute features that aggregate across all columns of the dataframe (e.g. `(df == 0).sum(axis=1)`, row-wise NaN counts over the entire frame), drop `isFraud` and `TransactionID` first. Otherwise the label signal bleeds into the features and produces implausibly high AUC (>0.95) that collapses the moment the fix is applied. Target encoding must use out-of-fold protection: compute encoding on train folds only, never on the full train + val concat.
diff --git a/examples/fraud-detection/prepare_data.py b/examples/fraud-detection/prepare_data.py
new file mode 100644
index 0000000..f9ab9c7
--- /dev/null
+++ b/examples/fraud-detection/prepare_data.py
@@ -0,0 +1,130 @@
+"""Download IEEE-CIS data, build base features, subsample to a small split.
+
+Produces `data/base_train_small.parquet` and `data/base_val_small.parquet` that
+`train.py` loads. The split is time-based (the last 20% of transactions by
+TransactionDT are held out for validation), which mirrors production fraud
+detection: you never train on future data.
+
+Usage:
+    # 1. Put your Kaggle API token at ~/.kaggle/kaggle.json
+    #    (see https://github.com/Kaggle/kaggle-api#api-credentials)
+    # 2. Join the competition on kaggle.com/c/ieee-fraud-detection to accept rules
+    # 3. Run:
+    python prepare_data.py
+
+Runtime: ~2-3 minutes on a modern laptop. Produces ~150MB of parquet files.
+"""
+
+from __future__ import annotations
+
+import subprocess
+import sys
+import zipfile
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+DATA_DIR = Path(__file__).parent / "data"  # raw CSVs and the output parquet files live here
+TRAIN_SIZE = 100_000  # max rows kept from the training window
+VAL_SIZE = 25_000  # max rows kept from the validation window
+TIME_SPLIT_FRAC = 0.8  # first 80% of transactions by time = train candidates
+SEED = 42  # random_state for every DataFrame.sample call in this script
+
+
+def download_kaggle() -> None:
+    """Download and unzip ieee-fraud-detection via the Kaggle CLI (no-op if both train CSVs exist)."""
+    DATA_DIR.mkdir(exist_ok=True)
+    txn = DATA_DIR / "train_transaction.csv"
+    ident = DATA_DIR / "train_identity.csv"
+    if txn.exists() and ident.exists():
+        print(f"[skip] raw CSVs already present in {DATA_DIR}")
+        return
+
+    print(f"[download] kaggle competitions download -c ieee-fraud-detection -p {DATA_DIR}")
+    subprocess.check_call(
+        ["kaggle", "competitions", "download", "-c", "ieee-fraud-detection",
+         "-p", str(DATA_DIR)]
+    )
+    zip_path = DATA_DIR / "ieee-fraud-detection.zip"
+    print(f"[extract] {zip_path}")
+    with zipfile.ZipFile(zip_path) as zf:
+        zf.extractall(DATA_DIR)
+    zip_path.unlink()
+
+
+def build_base_features(df: pd.DataFrame) -> pd.DataFrame:
+    """Minimal, leakage-safe preprocessing so train.py has a clean starting point.
+
+    - Label-encode object columns (LightGBM doesn't take strings); this step writes
+      into the caller's `df`, and missing values get code -1 (pandas `cat.codes`)
+    - Drop every V-column whose |r| with an earlier V-column exceeds 0.95
+      to keep train.py's input dimensionality manageable
+    """
+    # Label-encode all object columns. Keep isFraud/TransactionID/TransactionDT intact.
+    obj_cols = df.select_dtypes(include=["object"]).columns.tolist()
+    for col in obj_cols:
+        df[col] = df[col].astype("category").cat.codes.astype(np.int32)
+
+    # Reduce V-features by correlation clustering (done on a sample for speed).
+    v_cols = [c for c in df.columns if c.startswith("V")]
+    if v_cols:
+        sample = df[v_cols].sample(n=min(10_000, len(df)), random_state=SEED)
+        corr = sample.corr().abs()
+        upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))  # strict upper triangle: each pair once
+        to_drop = [c for c in upper.columns if (upper[c] > 0.95).any()]  # the earlier column of each pair is kept
+        df = df.drop(columns=to_drop)
+        print(f"[v-reduce] dropped {len(to_drop)}/{len(v_cols)} correlated V-features")
+
+    return df
+
+
+def time_based_split(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:  # -> (train, val)
+    df = df.sort_values("TransactionDT").reset_index(drop=True)
+    split_point = df["TransactionDT"].quantile(TIME_SPLIT_FRAC)  # a TransactionDT cutoff, not a row index
+    train = df[df["TransactionDT"] <= split_point].copy()  # rows exactly at the cutoff go to train
+    val = df[df["TransactionDT"] > split_point].copy()  # strictly later than every train row
+    return train, val
+
+
+def subsample(df: pd.DataFrame, n: int, label: str) -> pd.DataFrame:  # `label` only appears in the log line
+    if len(df) <= n:  # nothing to cut: the input frame itself is returned and nothing is logged
+        return df
+    sampled = df.sample(n=n, random_state=SEED).sort_values("TransactionDT").reset_index(drop=True)
+    fraud_rate = sampled["isFraud"].mean()  # unweighted sample, so the rate is not forced to match the input's
+    print(f"[subsample] {label}: {len(df)} -> {len(sampled)} (fraud rate {fraud_rate:.3%})")
+    return sampled
+
+
+def main() -> None:  # download -> merge -> encode/prune -> time split -> subsample -> write parquet
+    download_kaggle()
+
+    train_out = DATA_DIR / "base_train_small.parquet"
+    val_out = DATA_DIR / "base_val_small.parquet"
+    if train_out.exists() and val_out.exists():  # both outputs present: skip the rebuild entirely
+        print(f"[skip] {train_out.name} and {val_out.name} already exist")
+        return
+
+    print("[load] merging train_transaction + train_identity")
+    txn = pd.read_csv(DATA_DIR / "train_transaction.csv")
+    ident = pd.read_csv(DATA_DIR / "train_identity.csv")
+    df = txn.merge(ident, on="TransactionID", how="left")  # left join keeps transactions with no identity row
+    print(f"[load] shape={df.shape}, fraud rate {df['isFraud'].mean():.3%}")
+
+    df = build_base_features(df)
+
+    print("[split] time-based 80/20")
+    train_df, val_df = time_based_split(df)
+    print(f"[split] train={len(train_df)} val={len(val_df)}")
+
+    train_small = subsample(train_df, TRAIN_SIZE, "train")  # sampling happens after the split, within each window
+    val_small = subsample(val_df, VAL_SIZE, "val")
+
+    train_small.to_parquet(train_out, index=False)
+    val_small.to_parquet(val_out, index=False)
+    print(f"[write] {train_out}")
+    print(f"[write] {val_out}")
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/examples/fraud-detection/requirements.txt b/examples/fraud-detection/requirements.txt
new file mode 100644
index 0000000..188fe96
--- /dev/null
+++ b/examples/fraud-detection/requirements.txt
@@ -0,0 +1,7 @@
+weco
+numpy>=1.24
+pandas>=2.0
+scikit-learn>=1.3
+lightgbm>=4.0
+pyarrow>=13.0
+kaggle>=1.6
diff --git a/examples/fraud-detection/train.py b/examples/fraud-detection/train.py
new file mode 100644
index 0000000..7e3faa8
--- /dev/null
+++ b/examples/fraud-detection/train.py
@@ -0,0 +1,122 @@
+"""Baseline fraud-detection pipeline on IEEE-CIS. Weco will optimize this file.
+
+Weco can modify anything in `build_features` and `train_and_evaluate`. The
+`run_pipeline` function is the entry point called by `evaluate.py`.
+
+Keep the final print format exactly as `auc_roc: 0.xxxxxx` so Weco can parse
+the metric. Everything else is fair game to rewrite.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+import lightgbm as lgb
+from sklearn.metrics import roc_auc_score
+
+
+def build_features(
+    train_df: pd.DataFrame, val_df: pd.DataFrame
+) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Build features from the base data. Returns (X_train, y_train, X_val, y_val).
+
+    This is a small starting set. Weco can replace or extend it — the case study
+    found UID-based aggregations (card1 + addr1 + account-creation-day estimate),
+    target encoding with out-of-fold protection, frequency encoding, and velocity
+    features are the most impactful additions.
+    """
+    y_train = train_df["isFraud"].values.astype(np.int32)
+    y_val = val_df["isFraud"].values.astype(np.int32)
+
+    n_train = len(train_df)  # train rows come first in the concat, so X[:n_train] recovers the split
+    df = pd.concat([train_df, val_df], axis=0, ignore_index=True)
+
+    # Drop the label BEFORE any cross-column aggregation to avoid target leakage.
+    df = df.drop(columns=["isFraud", "TransactionID"])
+
+    # --- Time features from TransactionDT (seconds offset from a reference date) ---
+    df["hour"] = (df["TransactionDT"] // 3600) % 24
+    df["day_of_week"] = (df["TransactionDT"] // 86400) % 7  # day index mod 7 from the reference date, not a calendar weekday
+    df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
+    df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)
+
+    # --- Amount features ---
+    df["TransactionAmt_log"] = np.log1p(df["TransactionAmt"])
+    df["TransactionAmt_decimal"] = (
+        df["TransactionAmt"] - df["TransactionAmt"].astype(int)
+    ).round(2)
+    df["TransactionAmt_is_round"] = (df["TransactionAmt_decimal"] == 0).astype(np.int8)
+
+    # --- Simple aggregations on card1 / addr1 (label-free, but computed over train + val rows) ---
+    for key in ["card1", "addr1"]:
+        grp = df.groupby(key)["TransactionAmt"]
+        df[f"{key}_amt_mean"] = grp.transform("mean")
+        df[f"{key}_amt_std"] = grp.transform("std").fillna(0)  # std is NaN for single-row groups
+        df[f"{key}_amt_count"] = grp.transform("count")
+
+    # --- Frequency encoding for high-cardinality categoricals (also over train + val rows) ---
+    for col in ["card1", "card2", "card5", "addr1"]:
+        if col in df.columns:
+            freq = df[col].value_counts(normalize=True)  # NaN is excluded from the counts by default
+            df[f"{col}_freq"] = df[col].map(freq).fillna(0)  # so NaN rows map to NaN and become 0 here
+
+    df = df.drop(columns=["TransactionDT"])
+    X = df.values.astype(np.float32)  # requires every remaining column to be numeric
+    return X[:n_train], y_train, X[n_train:], y_val
+
+
+def train_and_evaluate(
+    X_train: np.ndarray,
+    y_train: np.ndarray,
+    X_val: np.ndarray,
+    y_val: np.ndarray,
+) -> float:
+    """Train LightGBM and return AUC-ROC on the validation set.
+
+    Reasonable-but-not-heavily-tuned hyperparameters. A fraud team would
+    typically run Optuna for 50-100 trials on these — there is headroom.
+    """
+    params = {
+        "objective": "binary",
+        "metric": "auc",
+        "boosting_type": "gbdt",
+        "learning_rate": 0.05,
+        "num_leaves": 127,
+        "max_depth": -1,
+        "min_child_samples": 50,
+        "subsample": 0.8,  # alias of bagging_fraction; inert while bagging_freq stays at its default of 0
+        "colsample_bytree": 0.8,
+        "reg_alpha": 0.1,
+        "reg_lambda": 1.0,
+        "scale_pos_weight": 1,
+        "n_jobs": 4,
+        "verbose": -1,
+        "seed": 42,
+    }
+
+    train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False)
+    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data, free_raw_data=False)
+
+    model = lgb.train(
+        params,
+        train_data,
+        num_boost_round=1000,  # upper bound; early stopping may end training sooner
+        valid_sets=[val_data],  # the set scored below also drives early stopping
+        valid_names=["val"],
+        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)],  # 50-round patience; period 0 = no per-round logs
+    )
+
+    y_pred = model.predict(X_val)  # num_iteration is left at its default (best iteration when one is recorded)
+    return float(roc_auc_score(y_val, y_pred))
+
+
+def run_pipeline() -> float:  # entry point used by evaluate.py; returns validation AUC-ROC
+    train_df = pd.read_parquet("data/base_train_small.parquet")  # relative path: resolved against the working directory
+    val_df = pd.read_parquet("data/base_val_small.parquet")
+    X_train, y_train, X_val, y_val = build_features(train_df, val_df)
+    return train_and_evaluate(X_train, y_train, X_val, y_val)
+
+
+if __name__ == "__main__":
+    auc = run_pipeline()
+    print(f"auc_roc: {auc:.6f}")

From 92cb31d6a45b6aca5ec34b7d6b56292c1db5e251 Mon Sep 17 00:00:00 2001
From: Zhengyao Jiang <jzyjiangzhengyao@gmail.com>
Date: Thu, 23 Apr 2026 15:31:35 +0100
Subject: [PATCH 2/8] examples/fraud-detection: fix venv + kaggle invocation
 after fresh-user test

Two fresh-agent test rounds surfaced three issues; all fixed:

- kaggle CLI: the `kaggle` package has no __main__, so `python -m kaggle`
  crashes with ModuleNotFoundError. Correct entry point is `kaggle.cli`.
- venv instruction used `python -m venv`, which fails on Debian/Ubuntu
  systems where only `python3` exists (no python-is-python3). Changed to
  `python3 -m venv`. After activation `python` resolves correctly.
- pip-install fails on modern PEP 668 systems without a venv. README now
  leads with the venv setup before the install step, with a note on why.

Also: prepare_data.py now catches Kaggle CalledProcessError and prints
the two most common root causes (rules not accepted / kaggle.json perms)
with the exact URL to accept the competition rules.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/fraud-detection/README.md       | 20 ++++++++++++++++----
 examples/fraud-detection/prepare_data.py | 23 +++++++++++++++++++----
 2 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/examples/fraud-detection/README.md b/examples/fraud-detection/README.md
index be05c22..5e61f06 100644
--- a/examples/fraud-detection/README.md
+++ b/examples/fraud-detection/README.md
@@ -16,10 +16,13 @@ after 200 steps with `gemini-3.1-pro-preview` and the instructions in
 ## Prerequisites
 
 1. **Kaggle API token**. Put a valid `kaggle.json` at `~/.kaggle/kaggle.json`
-   (see [Kaggle API credentials](https://github.com/Kaggle/kaggle-api#api-credentials)).
-2. **Join the competition**. Visit
-   <https://www.kaggle.com/c/ieee-fraud-detection> and accept the rules, or
-   the download will 403.
+   (see [Kaggle API credentials](https://github.com/Kaggle/kaggle-api#api-credentials)),
+   then `chmod 600 ~/.kaggle/kaggle.json` to silence the permissions warning.
+2. **You must join the competition.** Visit
+   <https://www.kaggle.com/c/ieee-fraud-detection> and click "Late Submission" /
+   "Join Competition" to accept the rules. Without this,
+   `prepare_data.py` will fail with `403 Forbidden` from the Kaggle API —
+   this is the single most common first-time friction.
 3. **Weco API key** (free tier is fine). See the
    [Weco docs](https://docs.weco.ai).
 
@@ -27,6 +30,15 @@ after 200 steps with `gemini-3.1-pro-preview` and the instructions in
 
 ```bash
 cd examples/fraud-detection
+
+# Virtualenv is strongly recommended — modern Python installs (Debian/Ubuntu,
+# recent Homebrew) refuse `pip install` to the system site-packages under
+# PEP 668. If you skip this step you'll hit
+# `error: externally-managed-environment`.
+python3 -m venv .venv
+source .venv/bin/activate     # Windows: .venv\Scripts\activate
+# After activation, `python` resolves to the venv's interpreter.
+
 pip install -r requirements.txt
 
 # Downloads ~120MB of CSVs, builds a small 100K/25K parquet split.
diff --git a/examples/fraud-detection/prepare_data.py b/examples/fraud-detection/prepare_data.py
index f9ab9c7..f538b9d 100644
--- a/examples/fraud-detection/prepare_data.py
+++ b/examples/fraud-detection/prepare_data.py
@@ -42,10 +42,25 @@ def download_kaggle() -> None:
         return
 
     print(f"[download] kaggle competitions download -c ieee-fraud-detection -p {DATA_DIR}")
-    subprocess.check_call(
-        ["kaggle", "competitions", "download", "-c", "ieee-fraud-detection",
-         "-p", str(DATA_DIR)]
-    )
+    print("[download] this takes ~1-2 min over a fast link; ~120MB of CSVs")
+    # Use `python -m kaggle.cli` — the `kaggle` package has no __main__, so
+    # `python -m kaggle` fails. kaggle.cli is the canonical entry point.
+    try:
+        subprocess.check_call(
+            [sys.executable, "-m", "kaggle.cli", "competitions", "download",
+             "-c", "ieee-fraud-detection", "-p", str(DATA_DIR)]
+        )
+    except subprocess.CalledProcessError as e:
+        print(
+            "\n[error] Kaggle download failed. Most common causes:\n"
+            "  1. You haven't joined the competition. Visit\n"
+            "     https://www.kaggle.com/c/ieee-fraud-detection\n"
+            "     and click 'Late Submission' / 'Join Competition' to accept the rules.\n"
+            "  2. ~/.kaggle/kaggle.json is missing or has wrong permissions.\n"
+            "     Run: chmod 600 ~/.kaggle/kaggle.json\n",
+            file=sys.stderr,
+        )
+        raise SystemExit(e.returncode)
     zip_path = DATA_DIR / "ieee-fraud-detection.zip"
     print(f"[extract] {zip_path}")
     with zipfile.ZipFile(zip_path) as zf:

From 85c58b4ecca347128d4eb2d96b7bf5822131c191 Mon Sep 17 00:00:00 2001
From: Zhengyao Jiang <jzyjiangzhengyao@gmail.com>
Date: Thu, 23 Apr 2026 16:41:41 +0100
Subject: [PATCH 3/8] examples/fraud-detection: fix time leakage in baseline
 (Codex review)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Codex flagged that the baseline concatenates train + val before computing
groupby aggregations and frequency encodings, letting val-period
distribution shape train features and letting each val row influence its
own encoded values. Even with isFraud dropped first, this is time-leakage
that inflates val AUC vs. what would be seen at serving time.

Fix: compute all encoders (card1/addr1 amount stats, frequency encoding)
on train_df only; .join/.map onto both splits; fill unseen val keys with
train-global defaults. Refactored per-row features (time, amount) into a
small helper so both splits share that code path without concat.

Baseline AUC drops from the previously-reported 0.914 to 0.910 — the
right number, not artificially inflated. Expected Weco trajectory (0.928-
0.933 at 200 steps with full instructions) unchanged in shape; case study
absolute numbers used the leaky baseline so they shift slightly here.

Also expanded instructions.md and README to distinguish target leakage
(isFraud in the dataframe during aggregation) from time leakage (val
distribution in the encoder fit), with the fit-on-train / apply-to-both
pattern spelled out for future encoders Weco proposes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/fraud-detection/README.md       |  59 ++++++++-----
 examples/fraud-detection/instructions.md |  21 ++++-
 examples/fraud-detection/train.py        | 102 +++++++++++++++--------
 3 files changed, 122 insertions(+), 60 deletions(-)

diff --git a/examples/fraud-detection/README.md b/examples/fraud-detection/README.md
index 5e61f06..e85cb27 100644
--- a/examples/fraud-detection/README.md
+++ b/examples/fraud-detection/README.md
@@ -8,10 +8,12 @@ held-out, time-based validation split.
 
 This example reproduces the setup from Weco's fraud-detection case study
 ([blog post](https://weco.ai/blog/framing-the-problem),
-[code](https://github.com/WecoAI/fraud-detection-case-study)). Expected
-improvement: **baseline ≈ 0.914 → full-pipeline pooled mean 0.9305 ± 0.0035**
-after 200 steps with `gemini-3.1-pro-preview` and the instructions in
-`instructions.md`.
+[code](https://github.com/WecoAI/fraud-detection-case-study)). The example's
+baseline is **AUC ≈ 0.910** — a few points below the 0.914 reported in the
+case study because this example fits all encoders on `train_df` only
+(no time-leakage into val features). With the bundled `instructions.md`
+and 200 steps of `gemini-3.1-pro-preview`, expect AUC in the **0.928–0.933**
+range, consistent with the case study trajectory on a clean baseline.
 
 ## Prerequisites
 
@@ -62,10 +64,10 @@ Run the baseline once to confirm everything loads:
 
 ```bash
 python evaluate.py
-# → auc_roc: 0.914xxx   (takes ~30s)
+# → auc_roc: 0.910xxx   (takes ~30s)
 ```
 
-If you see an AUC in the 0.91-0.92 range, you're ready.
+If you see an AUC in the 0.90-0.92 range, you're ready.
 
 ## Run Weco
 
@@ -125,28 +127,39 @@ Expected trajectory:
    `features.py` + `model.py`). In our case study, features-only delivered
    most of the improvement that full-pipeline did.
 
-## Watch out for silent target leakage
+## Watch out for silent leakage
 
-IEEE-CIS is a known trap for automated optimizers. A plausible idea like
-"count how many columns are zero per row" becomes leaky if the dataframe
-still contains `isFraud`, because fraud rows contribute a different count
-than non-fraud rows. The `build_features` in `train.py` drops `isFraud` and
-`TransactionID` before any cross-column aggregation — don't let proposals
-reintroduce aggregations on a dataframe that still contains the label.
+Two flavors both show up in IEEE-CIS optimization runs.
 
-Signs to check for when a run reports a surprisingly high AUC (> 0.95 on this
-subsample):
+**Target leakage** — `isFraud` ends up encoded into features. A plausible
+idea like "count how many columns are zero per row" becomes leaky if the
+dataframe still contains `isFraud`, because fraud rows contribute a
+different count than non-fraud rows. The baseline `build_features` drops
+`isFraud` and `TransactionID` up-front; don't let proposals reintroduce
+aggregations on a dataframe that still has the label. The case study walks
+through a real instance where this bug reported AUC 0.9591 that dropped to
+0.9154 after a one-line fix — see
+<https://weco.ai/blog/framing-the-problem>.
+
+**Time leakage** — validation-period statistics leak into train features.
+This is a time-based split; at serving time you don't have the val period.
+Any encoder, groupby aggregation, frequency count, or target encoding must
+be **fit on `train_df` only** and then applied to both splits. The baseline
+demonstrates the pattern — fit `card1_amt_mean` on train, `.join` it onto
+both train and val, fill unseen val keys with a train-global default. If a
+proposal does `pd.concat([train_df, val_df]).groupby(...)`, that's a leak
+even if it drops `isFraud` first.
+
+Signs a run has one of these leaks (AUC suspiciously high on this 100K/25K
+subsample, e.g. > 0.95):
 
 - Any `df.sum`/`df.mean`/`(df == x)` across all columns before the label is
   dropped.
-- Target encoding without out-of-fold protection (encoder fit on train + val
-  concat).
-- Features computed using validation data (time-leakage: using `val_df` in
-  `train`'s feature-engineering step).
-
-The case study walks through a real instance where an uninstructed run
-reported AUC 0.9591 that dropped to 0.9154 after a one-line fix. See
-<https://weco.ai/blog/framing-the-problem>.
+- Target encoding without out-of-fold protection (encoder fit on full train
+  then applied to train).
+- Groupby / value-counts / target encoders fit on `pd.concat([train, val])`.
+- Features computed using validation data at all — velocity features that
+  sort train + val together and take row-wise diffs, etc.
 
 ## Citing the case study
 
diff --git a/examples/fraud-detection/instructions.md b/examples/fraud-detection/instructions.md
index cfef410..a57ac47 100644
--- a/examples/fraud-detection/instructions.md
+++ b/examples/fraud-detection/instructions.md
@@ -95,5 +95,22 @@ Count of NaN values across D-columns, M-columns, V-columns per row. Sum/mean of
 - Categoricals are already integer-encoded — treat them as numeric
 - Keep the `run_pipeline() -> float` function signature and the `auc_roc: 0.xxxxxx` print format intact
 
-## Avoiding silent target leakage
-`isFraud` is the label. If you compute features that aggregate across all columns of the dataframe (e.g. `(df == 0).sum(axis=1)`, row-wise NaN counts over the entire frame), drop `isFraud` and `TransactionID` first. Otherwise the label signal bleeds into the features and produces implausibly high AUC (>0.95) that collapses the moment the fix is applied. Target encoding must use out-of-fold protection: compute encoding on train folds only, never on the full train + val concat.
+## Avoiding silent leakage
+
+Two distinct leaks to avoid. Both inflate reported AUC without improving the real pipeline.
+
+**1. Target leakage (isFraud bleeding into features).** `isFraud` is the label. If you compute features that aggregate across all columns of the dataframe (e.g. `(df == 0).sum(axis=1)`, row-wise NaN counts over the entire frame), drop `isFraud` and `TransactionID` first. Otherwise the label signal encodes into the features and produces implausibly high AUC (> 0.95) that collapses the moment the fix is applied.
+
+**2. Time leakage (validation distribution bleeding into features).** This is a time-based train/val split — val rows are transactions from a later period you wouldn't see at serving time. Any encoder, aggregation, frequency count, or target encoding MUST be fit on `train_df` only and then applied to both splits. Concatenating `train_df + val_df` before a `groupby` lets val-period statistics shape train features and lets each val row influence its own encoded values. Expected fallout: smaller inflation than target leakage, but still material (noticeable bump in val AUC that doesn't survive a real time cutoff).
+
+Pattern to follow for any new group/frequency/target encoder:
+
+```python
+# Fit on train
+freq = train_df[col].value_counts(normalize=True)
+# Apply to both, unseen keys get 0 (or a sensible train-global default)
+train_df[f"{col}_freq"] = train_df[col].map(freq).fillna(0)
+val_df[f"{col}_freq"] = val_df[col].map(freq).fillna(0)
+```
+
+For target encoding specifically, even on train you need out-of-fold protection (fit encoder on K-1 folds, apply to the held-out fold) — otherwise you leak train labels into train features.
diff --git a/examples/fraud-detection/train.py b/examples/fraud-detection/train.py
index 7e3faa8..10e1c46 100644
--- a/examples/fraud-detection/train.py
+++ b/examples/fraud-detection/train.py
@@ -15,54 +15,86 @@
 from sklearn.metrics import roc_auc_score
 
 
-def build_features(
-    train_df: pd.DataFrame, val_df: pd.DataFrame
-) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
-    """Build features from the base data. Returns (X_train, y_train, X_val, y_val).
-
-    This is a small starting set. Weco can replace or extend it — the case study
-    found UID-based aggregations (card1 + addr1 + account-creation-day estimate),
-    target encoding with out-of-fold protection, frequency encoding, and velocity
-    features are the most impactful additions.
-    """
-    y_train = train_df["isFraud"].values.astype(np.int32)
-    y_val = val_df["isFraud"].values.astype(np.int32)
-
-    n_train = len(train_df)
-    df = pd.concat([train_df, val_df], axis=0, ignore_index=True)
-
-    # Drop the label BEFORE any cross-column aggregation to avoid target leakage.
-    df = df.drop(columns=["isFraud", "TransactionID"])
-
-    # --- Time features from TransactionDT (seconds offset from a reference date) ---
+def _add_row_features(df: pd.DataFrame) -> pd.DataFrame:
+    """Per-row features that don't depend on any other row (safe to compute anywhere)."""
+    df = df.copy()
     df["hour"] = (df["TransactionDT"] // 3600) % 24
     df["day_of_week"] = (df["TransactionDT"] // 86400) % 7
     df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
     df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)
 
-    # --- Amount features ---
     df["TransactionAmt_log"] = np.log1p(df["TransactionAmt"])
     df["TransactionAmt_decimal"] = (
         df["TransactionAmt"] - df["TransactionAmt"].astype(int)
     ).round(2)
     df["TransactionAmt_is_round"] = (df["TransactionAmt_decimal"] == 0).astype(np.int8)
+    return df
 
-    # --- Simple aggregations on card1 / addr1 ---
-    for key in ["card1", "addr1"]:
-        grp = df.groupby(key)["TransactionAmt"]
-        df[f"{key}_amt_mean"] = grp.transform("mean")
-        df[f"{key}_amt_std"] = grp.transform("std").fillna(0)
-        df[f"{key}_amt_count"] = grp.transform("count")
 
-    # --- Frequency encoding for high-cardinality categoricals ---
-    for col in ["card1", "card2", "card5", "addr1"]:
-        if col in df.columns:
-            freq = df[col].value_counts(normalize=True)
-            df[f"{col}_freq"] = df[col].map(freq).fillna(0)
+def build_features(
+    train_df: pd.DataFrame, val_df: pd.DataFrame
+) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Build features from the base data. Returns (X_train, y_train, X_val, y_val).
+
+    Any aggregation or encoding (groupby stats, frequency, target encoding, ...)
+    is fit on `train_df` ONLY and applied to both train and val. This mirrors
+    production: at serving time you do not have the validation period yet, so
+    letting val rows shape the features is time-leakage that inflates the AUC
+    Weco optimizes against.
+
+    Weco can replace or extend this — the case study found UID-based
+    aggregations (card1 + addr1 + account-creation-day estimate), target
+    encoding with out-of-fold protection, frequency encoding, and velocity
+    features are the most impactful additions. Keep the fit-on-train /
+    apply-to-both discipline for any new encoder.
+    """
+    y_train = train_df["isFraud"].values.astype(np.int32)
+    y_val = val_df["isFraud"].values.astype(np.int32)
+
+    # Drop label/ID from a copy of each split so no downstream aggregation can
+    # accidentally include them.
+    train = train_df.drop(columns=["isFraud", "TransactionID"])
+    val = val_df.drop(columns=["isFraud", "TransactionID"])
 
-    df = df.drop(columns=["TransactionDT"])
-    X = df.values.astype(np.float32)
-    return X[:n_train], y_train, X[n_train:], y_val
+    train = _add_row_features(train)
+    val = _add_row_features(val)
+
+    # --- Aggregations on card1 / addr1 (fit on train, apply to both) ---
+    for key in ["card1", "addr1"]:
+        grp = train.groupby(key)["TransactionAmt"]
+        stats = grp.agg(["mean", "std", "count"]).rename(
+            columns={"mean": f"{key}_amt_mean",
+                     "std": f"{key}_amt_std",
+                     "count": f"{key}_amt_count"}
+        )
+        # Unseen keys in val: fall back to train-global mean/std and count=0.
+        defaults = {
+            f"{key}_amt_mean": train["TransactionAmt"].mean(),
+            f"{key}_amt_std": train["TransactionAmt"].std(),
+            f"{key}_amt_count": 0,
+        }
+        train = train.join(stats, on=key)
+        val = val.join(stats, on=key)
+        for col, default in defaults.items():
+            train[col] = train[col].fillna(default)
+            val[col] = val[col].fillna(default)
+
+    # --- Frequency encoding (fit on train, apply to both; unseen = 0) ---
+    for col in ["card1", "card2", "card5", "addr1"]:
+        if col not in train.columns:
+            continue
+        freq = train[col].value_counts(normalize=True)
+        train[f"{col}_freq"] = train[col].map(freq).fillna(0)
+        val[f"{col}_freq"] = val[col].map(freq).fillna(0)
+
+    train = train.drop(columns=["TransactionDT"])
+    val = val.drop(columns=["TransactionDT"])
+    # Ensure val's columns are in the same order as train's before converting to arrays.
+    val = val[train.columns]
+
+    X_train = train.values.astype(np.float32)
+    X_val = val.values.astype(np.float32)
+    return X_train, y_train, X_val, y_val
 
 
 def train_and_evaluate(

From 381f18411beac4ae89084cfe0dc2caece0d41840 Mon Sep 17 00:00:00 2001
From: Zhengyao Jiang <jzyjiangzhengyao@gmail.com>
Date: Mon, 27 Apr 2026 11:26:00 +0100
Subject: [PATCH 4/8] examples/fraud-detection: reproduce the case-study
 parquets byte-exactly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous prepare_data.py used pandas df.sample(random_state=42), which
produced parquets with shape, fraud rate, and DT range matching the original
case-study but DIFFERENT row content — baseline AUC came out at 0.9023
instead of the case-study's 0.9102. Recovered the original ad-hoc prep
recipe from a Claude Code session transcript and rewrote prepare_data.py
to match. Two recipe details turned out to matter:

1. Stratified train subsample preserving fraud rate, using a single global
   np.random.seed(42) followed by sequential np.random.choice calls (NOT
   pandas df.sample). The val subsample inherits the advanced RNG state.
2. Label-encode using categories from concat(train, val), and include
   "string" alongside "object" in select_dtypes — pandas 3 uses StringDtype
   for string columns, and select_dtypes skips them when only "object" is
   included, silently leaving them as raw strings (which would then crash
   LightGBM or be dropped before fit).

Verified locally: re-running this prepare_data.py from a fresh Kaggle
download produces parquets with SHA-256s
  train: a2d7a6740559975b8e6d89bd605f1e29791dd7d3fee8abc6449552bbc18d29ae
  val:   8b426c8bf7fa845bc234dbce304b1107fd295143fac2398bab97b78805f50753
matching the case-study originals exactly. Baseline AUC = 0.910171.

README updated to reflect the now-deterministic 0.9102 baseline (the
previous "0.910 because we removed the leak" gloss was misleading — the
parquets themselves were different from the case-study). Reframed the
0.914 reference as the case-study's leaky-baseline AUC for clarity.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/fraud-detection/README.md       |  13 +-
 examples/fraud-detection/prepare_data.py | 144 ++++++++++++++++-------
 2 files changed, 106 insertions(+), 51 deletions(-)

diff --git a/examples/fraud-detection/README.md b/examples/fraud-detection/README.md
index e85cb27..b74b834 100644
--- a/examples/fraud-detection/README.md
+++ b/examples/fraud-detection/README.md
@@ -9,11 +9,12 @@ held-out, time-based validation split.
 This example reproduces the setup from Weco's fraud-detection case study
 ([blog post](https://weco.ai/blog/framing-the-problem),
 [code](https://github.com/WecoAI/fraud-detection-case-study)). The example's
-baseline is **AUC ≈ 0.910** — a few points below the 0.914 reported in the
-case study because this example fits all encoders on `train_df` only
-(no time-leakage into val features). With the bundled `instructions.md`
-and 200 steps of `gemini-3.1-pro-preview`, expect AUC in the **0.928–0.933**
-range, consistent with the case study trajectory on a clean baseline.
+baseline is **AUC ≈ 0.9102** (deterministic; verifiable via the SHA-256s
+in `prepare_data.py`). The case study reported 0.914, which used a slightly
+leaky `build_features` (concat-then-groupby on train+val); this example's
+`train.py` fits all encoders on `train_df` only — no time-leakage. With the
+bundled `instructions.md` and 200 steps of `gemini-3.1-pro-preview`, expect
+AUC in the **0.928–0.933** range.
 
 ## Prerequisites
 
@@ -64,7 +65,7 @@ Run the baseline once to confirm everything loads:
 
 ```bash
 python evaluate.py
-# → auc_roc: 0.910xxx   (takes ~30s)
+# → auc_roc: 0.910171   (deterministic, takes ~30s)
 ```
 
 If you see an AUC in the 0.90-0.92 range, you're ready.
diff --git a/examples/fraud-detection/prepare_data.py b/examples/fraud-detection/prepare_data.py
index f538b9d..b8da66e 100644
--- a/examples/fraud-detection/prepare_data.py
+++ b/examples/fraud-detection/prepare_data.py
@@ -1,9 +1,8 @@
-"""Download IEEE-CIS data, build base features, subsample to a small split.
+"""Download IEEE-CIS data and build the fixed train/val parquets used by train.py.
 
-Produces `data/base_train_small.parquet` and `data/base_val_small.parquet` that
-`train.py` loads. The split is time-based (the last 20% of transactions by
-TransactionDT are held out for validation), which mirrors production fraud
-detection: you never train on future data.
+Produces `data/base_train_small.parquet` (100K rows, stratified by fraud) and
+`data/base_val_small.parquet` (25K rows, time-later subsample). Identical SHA-256
+to the parquets used in the published case study.
 
 Usage:
     # 1. Put your Kaggle API token at ~/.kaggle/kaggle.json
@@ -13,6 +12,24 @@
     python prepare_data.py
 
 Runtime: ~2-3 minutes on a modern laptop. Produces ~150MB of parquet files.
+
+Pipeline (must stay byte-identical to the originals — see the SHA-256s printed by main()):
+1. Merge `train_transaction.csv` + `train_identity.csv` on TransactionID.
+2. Time-based 80/20 split on TransactionDT (last 20% by time = validation).
+3. V-feature correlation pruning: sample 10_000 rows from the FULL merged df with
+   `random_state=42`, drop V-cols whose pairwise |corr| > 0.95.
+4. Label-encode all `object`/`string` columns using categories from the
+   `concat(train, val)` dtype, so the same string maps to the same int in both
+   splits.
+5. **Stratified** subsample to 100K train via global `np.random.seed(42)` +
+   `np.random.choice` over fraud/legit indices (preserves the 3.5% fraud rate
+   exactly), and a uniform 25K val subsample drawn from the same RNG state.
+
+Each of these details matters for reproducing the published baseline AUC of
+0.910171. In particular:
+- "object" alone misses pandas-3 string-dtype columns; include "string" too.
+- pandas `df.sample()` and `np.random.seed`+`np.random.choice` give DIFFERENT
+  rows even with the same seed — the original used the latter.
 """
 
 from __future__ import annotations
@@ -28,8 +45,10 @@
 DATA_DIR = Path(__file__).parent / "data"
 TRAIN_SIZE = 100_000
 VAL_SIZE = 25_000
-TIME_SPLIT_FRAC = 0.8  # first 80% of transactions by time = train candidates
+TIME_SPLIT_FRAC = 0.8
 SEED = 42
+V_CORR_SAMPLE = 10_000
+V_CORR_THRESHOLD = 0.95
 
 
 def download_kaggle() -> None:
@@ -68,47 +87,71 @@ def download_kaggle() -> None:
     zip_path.unlink()
 
 
-def build_base_features(df: pd.DataFrame) -> pd.DataFrame:
-    """Minimal, leakage-safe preprocessing so train.py has a clean starting point.
+def time_based_split(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
+    split_point = df["TransactionDT"].quantile(TIME_SPLIT_FRAC)
+    train = df[df["TransactionDT"] <= split_point].copy()
+    val = df[df["TransactionDT"] > split_point].copy()
+    return train, val
 
-    - Drop test-specific columns
-    - Label-encode object columns (LightGBM doesn't take strings)
-    - Reduce highly correlated V-features (drop one per cluster with r > 0.95)
-      to keep train.py's input dimensionality manageable
-    """
-    # Label-encode all object columns. Keep isFraud/TransactionID/TransactionDT intact.
-    obj_cols = df.select_dtypes(include=["object"]).columns.tolist()
-    for col in obj_cols:
-        df[col] = df[col].astype("category").cat.codes.astype(np.int32)
 
-    # Reduce V-features by correlation clustering (done on a sample for speed).
-    v_cols = [c for c in df.columns if c.startswith("V")]
-    if v_cols:
-        sample = df[v_cols].sample(n=min(10_000, len(df)), random_state=SEED)
-        corr = sample.corr().abs()
-        upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
-        to_drop = [c for c in upper.columns if (upper[c] > 0.95).any()]
-        df = df.drop(columns=to_drop)
-        print(f"[v-reduce] dropped {len(to_drop)}/{len(v_cols)} correlated V-features")
+def reduce_v_features(
+    df_full: pd.DataFrame, train: pd.DataFrame, val: pd.DataFrame
+) -> tuple[pd.DataFrame, pd.DataFrame, list[str]]:
+    """Drop V-cols whose pairwise |corr| > threshold, sampled from FULL merged df."""
+    v_cols = [c for c in df_full.columns if c.startswith("V")]
+    if not v_cols:
+        return train, val, []
+    sample = df_full[v_cols].sample(n=min(V_CORR_SAMPLE, len(df_full)), random_state=SEED)
+    corr = sample.corr().abs()
+    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
+    to_drop = [c for c in upper.columns if (upper[c] > V_CORR_THRESHOLD).any()]
+    return train.drop(columns=to_drop), val.drop(columns=to_drop), to_drop
 
-    return df
 
+def label_encode_with_combined_categories(
+    train: pd.DataFrame, val: pd.DataFrame
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Encode all object/string cols using categories from concat(train, val).
 
-def time_based_split(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
-    df = df.sort_values("TransactionDT").reset_index(drop=True)
-    split_point = df["TransactionDT"].quantile(TIME_SPLIT_FRAC)
-    train = df[df["TransactionDT"] <= split_point].copy()
-    val = df[df["TransactionDT"] > split_point].copy()
+    Important: include both "object" AND "string" — pandas 3 strings have
+    StringDtype and aren't picked up by `include=["object"]` alone.
+    """
+    obj_cols = train.select_dtypes(include=["object", "string"]).columns
+    obj_cols = [c for c in obj_cols if c not in ("TransactionID", "isFraud")]
+    for col in obj_cols:
+        combined = pd.concat([train[col], val[col]]).astype("category")
+        cats = combined.cat.categories
+        train[col] = train[col].astype("category").cat.set_categories(cats).cat.codes
+        val[col] = val[col].astype("category").cat.set_categories(cats).cat.codes
     return train, val
 
 
-def subsample(df: pd.DataFrame, n: int, label: str) -> pd.DataFrame:
-    if len(df) <= n:
-        return df
-    sampled = df.sample(n=n, random_state=SEED).sort_values("TransactionDT").reset_index(drop=True)
-    fraud_rate = sampled["isFraud"].mean()
-    print(f"[subsample] {label}: {len(df)} -> {len(sampled)} (fraud rate {fraud_rate:.3%})")
-    return sampled
+def stratified_subsample(
+    train: pd.DataFrame, val: pd.DataFrame
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Stratified train subsample preserving fraud rate; uniform val subsample.
+
+    Uses ONE global `np.random.seed(42)` then sequential `np.random.choice`
+    calls — the val subsample inherits the RNG state advanced by the train
+    subsample. This sequential coupling matters for reproducibility.
+    """
+    np.random.seed(SEED)
+    fraud_idx = train[train["isFraud"] == 1].index
+    legit_idx = train[train["isFraud"] == 0].index
+    fraud_rate = len(fraud_idx) / len(train)
+    n_fraud = int(TRAIN_SIZE * fraud_rate)
+    n_legit = TRAIN_SIZE - n_fraud
+    si = np.sort(
+        np.concatenate([
+            np.random.choice(fraud_idx, n_fraud, replace=False),
+            np.random.choice(legit_idx, n_legit, replace=False),
+        ])
+    )
+    train_small = train.loc[si].reset_index(drop=True)
+    val_small = val.iloc[
+        np.random.choice(len(val), VAL_SIZE, replace=False)
+    ].reset_index(drop=True)
+    return train_small, val_small
 
 
 def main() -> None:
@@ -126,19 +169,30 @@ def main() -> None:
     df = txn.merge(ident, on="TransactionID", how="left")
     print(f"[load] shape={df.shape}, fraud rate {df['isFraud'].mean():.3%}")
 
-    df = build_base_features(df)
-
     print("[split] time-based 80/20")
-    train_df, val_df = time_based_split(df)
-    print(f"[split] train={len(train_df)} val={len(val_df)}")
+    train, val = time_based_split(df)
+    print(f"[split] train={len(train)} val={len(val)}")
+
+    print("[v-reduce] correlation pruning over full merged df")
+    train, val, dropped = reduce_v_features(df, train, val)
+    print(f"[v-reduce] dropped {len(dropped)} V cols (threshold {V_CORR_THRESHOLD})")
+
+    print("[encode] label-encode object/string cols using combined categories")
+    train, val = label_encode_with_combined_categories(train, val)
 
-    train_small = subsample(train_df, TRAIN_SIZE, "train")
-    val_small = subsample(val_df, VAL_SIZE, "val")
+    print("[subsample] stratified train, uniform val (np.random.seed=42)")
+    train_small, val_small = stratified_subsample(train, val)
+    print(f"[subsample] train={len(train_small)} (fraud {train_small['isFraud'].mean():.3%}), "
+          f"val={len(val_small)} (fraud {val_small['isFraud'].mean():.3%})")
 
     train_small.to_parquet(train_out, index=False)
     val_small.to_parquet(val_out, index=False)
     print(f"[write] {train_out}")
     print(f"[write] {val_out}")
+    print()
+    print("Expected SHA-256 (matches the published case study parquets):")
+    print("  train: a2d7a6740559975b8e6d89bd605f1e29791dd7d3fee8abc6449552bbc18d29ae")
+    print("  val:   8b426c8bf7fa845bc234dbce304b1107fd295143fac2398bab97b78805f50753")
 
 
 if __name__ == "__main__":

From 284f4929154d3c527ac0d9bfed02c9ae7763068f Mon Sep 17 00:00:00 2001
From: Zhengyao Jiang <jzyjiangzhengyao@gmail.com>
Date: Mon, 27 Apr 2026 22:47:16 +0100
Subject: [PATCH 5/8] examples/fraud-detection: use pip install --upgrade for
 weco-cli
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Recent CLI versions ship important fixes — most relevant here, 0.3.31
added queue-mode submit recovery (`_recover_queue_suggest`) and a
native `AutoResumePolicy` that together make the transient
`Failed to submit result` race invisible to the user. Anyone with an
older weco in their venv (e.g. operators reusing weco-gpu's pinned
0.3.25) was hitting this race, and their runs silently terminated short
of the step budget.

Switching the install command to `pip install --upgrade -r requirements.txt`
ensures users picking up this example always get the latest fixes,
regardless of what's pre-installed in their venv. Comment in the README
explains why we never pin weco-cli.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/fraud-detection/README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/examples/fraud-detection/README.md b/examples/fraud-detection/README.md
index b74b834..c2d7b44 100644
--- a/examples/fraud-detection/README.md
+++ b/examples/fraud-detection/README.md
@@ -42,7 +42,11 @@ python3 -m venv .venv
 source .venv/bin/activate     # Windows: .venv\Scripts\activate
 # After activation, `python` resolves to the venv's interpreter.
 
-pip install -r requirements.txt
+pip install --upgrade -r requirements.txt
+# Always pull the latest weco-cli — never pin. Recent versions ship important
+# fixes (e.g. 0.3.31 added queue-mode submit recovery that prevents transient
+# network errors from prematurely terminating runs). `--upgrade` ensures you
+# pick those up even if an older weco is already installed in the venv.
 
 # Downloads ~120MB of CSVs, builds a small 100K/25K parquet split.
 # Time-based split: last 20% of transactions by TransactionDT = validation.

From 235813c47278e013e57a7ebc7f08b163bd2ab37f Mon Sep 17 00:00:00 2001
From: Zhengyao Jiang <jzyjiangzhengyao@gmail.com>
Date: Wed, 29 Apr 2026 12:01:39 +0100
Subject: [PATCH 6/8] examples: add fraud-detection (strict fit/transform API),
 rename old to -loose
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous fraud-detection example exposed `build_features(train_df, val_df)`
in a single file. The agent could (and frequently did) `pd.concat([train, val])`
and silently introduce time-leakage in encoders. We measured the inflation at
0.001-0.005 AUC depending on parquet contents, and found that prompt-level
"fit on train only" warnings only achieved ~67% compliance across seeds.

The new fraud-detection/ example uses a fit/transform interface:
  features.py: class FeatureBuilder with fit(X_train, y_train) + transform(X)
  model.py:    train_and_evaluate(X_train, y_train, X_val, y_val) -> float
  evaluate.py: frozen orchestrator that strips isFraud, calls fb.fit then
               fb.transform twice, and runs the model.

This kills two classes of leakage (target and time) at the interface:
- isFraud is dropped before X reaches features.py (target leakage out).
- val data is never visible to fit() (time leakage out).
- transform() has no y argument (val labels can't influence val features).

Weco optimizes:
- features.py and model.py separately for scope=features / scope=model
- both together (`--sources features.py model.py`) for scope=full
The file boundary IS the scope boundary — no leaky helper module needed.

Existing single-file example renamed to fraud-detection-loose/ and kept as
a comparison artifact. README in fraud-detection/ links to it.

Baseline AUC: 0.909132 (deterministic; ~0.001 below the loose version's
0.910171 — that's the leakage inflation in the loose baseline).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/fraud-detection-loose/.gitignore     |   4 +
 examples/fraud-detection-loose/README.md      | 175 ++++++++++++
 examples/fraud-detection-loose/evaluate.py    |  35 +++
 .../fraud-detection-loose/instructions.md     | 116 ++++++++
 .../fraud-detection-loose/prepare_data.py     | 199 ++++++++++++++
 .../fraud-detection-loose/requirements.txt    |   7 +
 .../train.py                                  |   0
 examples/fraud-detection/README.md            | 258 ++++++++----------
 examples/fraud-detection/evaluate.py          |  53 +++-
 examples/fraud-detection/features.py          | 104 +++++++
 examples/fraud-detection/model.py             |  69 +++++
 11 files changed, 869 insertions(+), 151 deletions(-)
 create mode 100644 examples/fraud-detection-loose/.gitignore
 create mode 100644 examples/fraud-detection-loose/README.md
 create mode 100644 examples/fraud-detection-loose/evaluate.py
 create mode 100644 examples/fraud-detection-loose/instructions.md
 create mode 100644 examples/fraud-detection-loose/prepare_data.py
 create mode 100644 examples/fraud-detection-loose/requirements.txt
 rename examples/{fraud-detection => fraud-detection-loose}/train.py (100%)
 create mode 100644 examples/fraud-detection/features.py
 create mode 100644 examples/fraud-detection/model.py

diff --git a/examples/fraud-detection-loose/.gitignore b/examples/fraud-detection-loose/.gitignore
new file mode 100644
index 0000000..60f2536
--- /dev/null
+++ b/examples/fraud-detection-loose/.gitignore
@@ -0,0 +1,4 @@
+data/
+.runs/
+__pycache__/
+*.pyc
diff --git a/examples/fraud-detection-loose/README.md b/examples/fraud-detection-loose/README.md
new file mode 100644
index 0000000..c2d7b44
--- /dev/null
+++ b/examples/fraud-detection-loose/README.md
@@ -0,0 +1,175 @@
+# Fraud Detection (IEEE-CIS)
+
+Optimize a tabular fraud-detection pipeline on the
+[IEEE-CIS Fraud Detection](https://www.kaggle.com/c/ieee-fraud-detection) Kaggle
+dataset (real Vesta payment transactions). Weco rewrites `train.py` — both
+feature engineering and the LightGBM configuration — to maximize AUC-ROC on a
+held-out, time-based validation split.
+
+This example reproduces the setup from Weco's fraud-detection case study
+([blog post](https://weco.ai/blog/framing-the-problem),
+[code](https://github.com/WecoAI/fraud-detection-case-study)). The example's
+baseline is **AUC ≈ 0.9102** (deterministic; verifiable via the SHA-256s
+in `prepare_data.py`). The case study reported 0.914, which used a slightly
+leaky `build_features` (concat-then-groupby on train+val); this example's
+`train.py` fits all encoders on `train_df` only — no time-leakage. With the
+bundled `instructions.md` and 200 steps of `gemini-3.1-pro-preview`, expect
+AUC in the **0.928–0.933** range.
+
+## Prerequisites
+
+1. **Kaggle API token**. Put a valid `kaggle.json` at `~/.kaggle/kaggle.json`
+   (see [Kaggle API credentials](https://github.com/Kaggle/kaggle-api#api-credentials)),
+   then `chmod 600 ~/.kaggle/kaggle.json` to silence the permissions warning.
+2. **You must join the competition.** Visit
+   <https://www.kaggle.com/c/ieee-fraud-detection> and click "Late Submission" /
+   "Join Competition" to accept the rules. Without this,
+   `prepare_data.py` will fail with `403 Forbidden` from the Kaggle API —
+   this is the single most common first-time friction.
+3. **Weco API key** (free tier is fine). See the
+   [Weco docs](https://docs.weco.ai).
+
+## Setup
+
+```bash
+cd examples/fraud-detection-loose
+
+# Virtualenv is strongly recommended — modern Python installs (Debian/Ubuntu,
+# recent Homebrew) refuse `pip install` to the system site-packages under
+# PEP 668. If you skip this step you'll hit
+# `error: externally-managed-environment`.
+python3 -m venv .venv
+source .venv/bin/activate     # Windows: .venv\Scripts\activate
+# After activation, `python` resolves to the venv's interpreter.
+
+pip install --upgrade -r requirements.txt
+# Always pull the latest weco-cli — never pin. Recent versions ship important
+# fixes (e.g. 0.3.31 added queue-mode submit recovery that prevents transient
+# network errors from prematurely terminating runs). `--upgrade` ensures you
+# pick those up even if an older weco is already installed in the venv.
+
+# Downloads ~120MB of CSVs, builds a small 100K/25K parquet split.
+# Time-based split: last 20% of transactions by TransactionDT = validation.
+# ~2-3 minutes on a modern laptop.
+python prepare_data.py
+```
+
+After this you should have:
+
+```
+data/
+  train_transaction.csv, train_identity.csv, test_*.csv  # raw
+  base_train_small.parquet   # 100K rows, time-ordered
+  base_val_small.parquet     # 25K rows, later in time
+```
+
+## Quick sanity check
+
+Run the baseline once to confirm everything loads:
+
+```bash
+python evaluate.py
+# → auc_roc: 0.910171   (deterministic, takes ~30s)
+```
+
+If you see an AUC in the 0.90-0.92 range, you're ready.
+
+## Run Weco
+
+The "default" run uses the full EDA + techniques instructions (recommended —
+they contain the column semantics and known-good techniques for this dataset):
+
+```bash
+weco run --source train.py \
+     --eval-command "python evaluate.py" \
+     --metric auc_roc \
+     --goal maximize \
+     --steps 50 \
+     --model gemini-3.1-pro-preview \
+     --additional-instructions instructions.md \
+     --eval-timeout 300 \
+     --log-dir .runs/fraud-detection
+```
+
+Expected trajectory:
+
+- Steps 1–10: Weco explores — tries log-amount, simple aggregations, category
+  encodings. AUC moves into 0.918-0.925.
+- Steps 10–50: builds UID-style features (card1 + addr1 + account-creation
+  estimate via `D1`), target encoding with out-of-fold protection, velocity
+  features. AUC climbs to 0.928-0.933.
+- Beyond step 50: diminishing returns; the pooled mean across 6 seeds in our
+  case study was 0.9305 ± 0.0035.
+
+## Explanation
+
+- `--source train.py` — the file Weco rewrites. Both `build_features` and
+  `train_and_evaluate` are fair game.
+- `--eval-command "python evaluate.py"` — called after every proposed edit;
+  reimports `train.py`, runs the pipeline, prints `auc_roc: 0.xxxxxx`. Weco
+  parses the last line matching `--metric`.
+- `--metric auc_roc --goal maximize` — Weco optimizes the metric printed by
+  the evaluator.
+- `--additional-instructions instructions.md` — injects domain context into
+  every optimization step. **This is what mostly matters.** See the
+  case study: EDA-level instructions (what each column means in this
+  specific dataset) drive most of the gain. Kaggle-classic techniques are
+  typically already in the LLM's pretraining distribution. Feed the optimizer
+  what it couldn't already know — dataset-specific semantics, proprietary
+  heuristics, internal constraints.
+- `--eval-timeout 300` — one eval takes ~30-60s; 300s gives headroom for
+  feature-heavy proposals.
+
+## Things to try
+
+1. **No instructions baseline**: remove `--additional-instructions` and watch
+   variance across seeds balloon (std ~0.008 vs ~0.002 with instructions).
+   Also watch for silently-leaky proposals (see below).
+2. **EDA only**: keep only the column-meaning section of `instructions.md` —
+   the case study found this accounts for most of the mean gain.
+3. **Scope restriction**: point Weco at `train.py`'s `build_features` only by
+   editing the file to expose just that function (or split the pipeline into
+   `features.py` + `model.py`). In our case study, features-only delivered
+   most of the improvement that full-pipeline did.
+
+## Watch out for silent leakage
+
+Two flavors both show up in IEEE-CIS optimization runs.
+
+**Target leakage** — `isFraud` ends up encoded into features. A plausible
+idea like "count how many columns are zero per row" becomes leaky if the
+dataframe still contains `isFraud`, because fraud rows contribute a
+different count than non-fraud rows. The baseline `build_features` drops
+`isFraud` and `TransactionID` up-front; don't let proposals reintroduce
+aggregations on a dataframe that still has the label. The case study walks
+through a real instance where this bug reported AUC 0.9591 that dropped to
+0.9154 after a one-line fix — see
+<https://weco.ai/blog/framing-the-problem>.
+
+**Time leakage** — validation-period statistics leak into train features.
+This is a time-based split; at serving time you don't have the val period.
+Any encoder, groupby aggregation, frequency count, or target encoding must
+be **fit on `train_df` only** and then applied to both splits. The baseline
+demonstrates the pattern — fit `card1_amt_mean` on train, `.join` it onto
+both train and val, fill unseen val keys with a train-global default. If a
+proposal does `pd.concat([train_df, val_df]).groupby(...)`, that's a leak
+even if it drops `isFraud` first.
+
+Signs a run has one of these leaks (AUC suspiciously high on this 100K/25K
+subsample, e.g. > 0.95):
+
+- Any `df.sum`/`df.mean`/`(df == x)` across all columns before the label is
+  dropped.
+- Target encoding without out-of-fold protection (encoder fit on full train
+  then applied to train).
+- Groupby / value-counts / target encoders fit on `pd.concat([train, val])`.
+- Features computed using validation data at all — velocity features that
+  sort train + val together and take row-wise diffs, etc.
+
+## Citing the case study
+
+If you use this example, the underlying numbers come from
+<https://github.com/WecoAI/fraud-detection-case-study>. Setup: 200 steps,
+3 seeds per condition (6 for the Full pipeline + Full-instructions condition,
+pooled since the two ablations share that configuration),
+`gemini-3.1-pro-preview`.
diff --git a/examples/fraud-detection-loose/evaluate.py b/examples/fraud-detection-loose/evaluate.py
new file mode 100644
index 0000000..d3ad2d6
--- /dev/null
+++ b/examples/fraud-detection-loose/evaluate.py
@@ -0,0 +1,35 @@
+"""Evaluator Weco calls after each proposed edit.
+
+Loads train.py fresh each run (Weco rewrites it in place), executes the
+pipeline, and prints a single `auc_roc: 0.xxxxxx` line that Weco parses as
+the metric.
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import sys
+from pathlib import Path
+
+
+def load_module(path: str):
+    spec = importlib.util.spec_from_file_location("train_under_test", path)
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+    return mod
+
+
+def main() -> int:
+    train = load_module(str(Path(__file__).parent / "train.py"))
+    auc = train.run_pipeline()
+
+    if not (0.0 <= auc <= 1.0):
+        print(f"Constraint violated: AUC-ROC out of range ({auc})")
+        return 1
+
+    print(f"auc_roc: {auc:.6f}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/examples/fraud-detection-loose/instructions.md b/examples/fraud-detection-loose/instructions.md
new file mode 100644
index 0000000..a57ac47
--- /dev/null
+++ b/examples/fraud-detection-loose/instructions.md
@@ -0,0 +1,116 @@
+# Fraud Detection Optimization Instructions
+
+## Task
+Optimize `train.py` to maximize AUC-ROC for fraud detection on the IEEE-CIS dataset. You may modify both `build_features` (feature engineering) and `train_and_evaluate` (model config). Keep `run_pipeline`'s interface and the `auc_roc: 0.xxxxxx` print format unchanged so the evaluator can parse the metric.
+
+## Dataset Details
+- 100K train / 25K val, 3.5% fraud rate, time-based split
+- Base data has 297 columns after V-feature correlation pruning
+- Categoricals are already label-encoded as integers
+- TransactionDT is in seconds (timedelta from reference date, NOT a timestamp)
+
+## Column Meanings (from Kaggle community reverse-engineering)
+
+### Raw columns
+- **TransactionAmt**: USD amount. Heavy-tailed (median $68, max $4578). Log transform essential.
+- **ProductCD**: Product type (5 categories: C, H, R, S, W). Each has a distinct V-feature NaN pattern and fraud rate (C=11%, W=2.1%).
+- **card1**: Bank Identification Number (BIN) — first 6 digits of card. Top-3 importance.
+- **card2**: Additional card info. 1.5% NaN. Top-3 importance.
+- **card3/card5**: Card country/product type codes.
+- **card4**: Card network (visa, mastercard, etc).
+- **card6**: Card type (credit, debit).
+- **addr1**: Billing zip code (anonymized). 11.5% NaN.
+- **addr2**: Billing country.
+- **P_emaildomain**: Purchaser email domain (gmail.com, yahoo.com, etc).
+- **R_emaildomain**: Recipient email domain. Mismatch between P and R = fraud signal.
+- **dist1/dist2**: Distance features.
+
+### C-features (C1-C14): Entity occurrence COUNTS, no NaN
+- **C1** (importance rank #2): Count of addresses associated with the payment card
+- **C2**: Count of cards at the billing address
+- **C5**: Count of email addresses seen with this card
+- **C11**: Count of cards associated with a user identity
+- **C12**: Count of addresses associated with a user identity
+- **C13** (importance rank #4): Count of distinct email domains per entity — **one of the single most predictive raw features**. High values = fraud ring.
+- **C14** (importance rank #3): Related count feature
+
+### D-features (D1-D15): TIMEDELTA in days between events
+- **D1** (0.2% NaN, median 1 day): Days since last transaction. Most important D-feature. `TransactionDT/86400 - D1` estimates the **account creation date** — this is the key insight for UID construction.
+- **D2** (49% NaN, median 97 days): Days since card was first associated with the identity
+- **D3** (46% NaN): Days since last similar transaction
+- **D4** (29.5% NaN): Days since email association
+- **D10** (14% NaN): Days since last device-linked transaction
+- **D11** (52% NaN): Days since account was opened / account age
+- **D15** (16.5% NaN, median 46 days): Days since last transaction (alternative)
+- D-feature NaN rates themselves are informative — missingness patterns encode transaction type
+
+### M-features (M1-M9): Binary MATCH indicators
+Whether certain attributes match each other (name↔address, card↔billing, device↔historical, etc). Sum of True values, count of NaN, and the M-vector signature are all useful.
+
+### V-features (V1-V339, ~202 after pruning): Vesta-engineered risk signals
+Grouped by ProductCD — each product type uses a different subset of V-features (others are NaN). V258 is the #1 most important feature overall (gain=16703). Other important V-features: V283, V69, V130, V307, V294, V201.
+
+## Top Winning Techniques (from 1st-3rd place solutions)
+
+### 1. UID Construction (THE most impactful single technique)
+```python
+D1_start = floor(TransactionDT / 86400 - D1)  # estimated account creation day
+uid = card1 + "_" + addr1 + "_" + D1_start
+```
+This creates a stable user fingerprint. All aggregation features should be computed on this UID.
+
+### 2. UID-level aggregation features
+For each UID, compute: mean, std, count of TransactionAmt. Then z-score and ratio for each transaction relative to user's history. This captures "is this transaction unusual for this user?"
+
+### 3. Temporal centroid distance
+Compute the user's typical time-of-day using cyclical hour_sin/hour_cos means. The Euclidean distance of the current transaction from the centroid = "is this at an unusual time for this user?"
+
+### 4. D-feature lifecycle lags
+D1 - D2, D1 - D4, D1 - D10, D1 - D15: Inconsistencies between these timestamps indicate synthetic identities or account takeovers.
+
+### 5. Velocity features (sort by [uid, TransactionDT])
+Time since last transaction per user. Amount change from previous transaction. High velocity + high amount = fraud signal.
+
+### 6. Cross-entity cardinality (nunique)
+How many unique addr1 values per card1? How many unique card1 per addr1? How many unique P_emaildomain per uid? High cardinality = suspicious.
+
+### 7. NaN pattern signature
+The binary NaN/not-NaN pattern across D+M columns encodes the transaction type. Compute a bitwise signature or just count NaN per feature group.
+
+### 8. Frequency encoding
+For card1, card2, addr1, P_emaildomain, etc. — map each value to its frequency. Rare values (appearing once or twice) are fraud signals.
+
+### 9. Interaction features
+- amount_zscore × time_distance (unusual amount at unusual time)
+- amount_zscore × C1_ratio (unusual amount with unusual address count)
+- amount / (D1 + 1) = spending rate per day since last transaction
+
+### 10. Row-wise missingness features
+Count of NaN values across D-columns, M-columns, V-columns per row. Sum/mean of M-column values. The NaN pattern encodes the transaction profile.
+
+## Important Constraints
+- Keep code under 300 lines (Weco backend limit)
+- Use n_jobs=4 for any model operations
+- `train.py` loads `data/base_train_small.parquet` and `data/base_val_small.parquet` — don't change these paths
+- Categoricals are already integer-encoded — treat them as numeric
+- Keep the `run_pipeline() -> float` function signature and the `auc_roc: 0.xxxxxx` print format intact
+
+## Avoiding silent leakage
+
+Two distinct leaks to avoid. Both inflate reported AUC without improving the real pipeline.
+
+**1. Target leakage (isFraud bleeding into features).** `isFraud` is the label. If you compute features that aggregate across all columns of the dataframe (e.g. `(df == 0).sum(axis=1)`, row-wise NaN counts over the entire frame), drop `isFraud` and `TransactionID` first. Otherwise the label signal encodes into the features and produces implausibly high AUC (> 0.95) that collapses the moment the fix is applied.
+
+**2. Time leakage (validation distribution bleeding into features).** This is a time-based train/val split — val rows are transactions from a later period you wouldn't see at serving time. Any encoder, aggregation, frequency count, or target encoding MUST be fit on `train_df` only and then applied to both splits. Concatenating `train_df + val_df` before a `groupby` lets val-period statistics shape train features and lets each val row influence its own encoded values. Expected fallout: smaller inflation than target leakage, but still material (noticeable bump in val AUC that doesn't survive a real time cutoff).
+
+Pattern to follow for any new group/frequency/target encoder:
+
+```python
+# Fit on train
+freq = train_df[col].value_counts(normalize=True)
+# Apply to both, unseen keys get 0 (or a sensible train-global default)
+train_df[f"{col}_freq"] = train_df[col].map(freq).fillna(0)
+val_df[f"{col}_freq"] = val_df[col].map(freq).fillna(0)
+```
+
+For target encoding specifically, even on train you need out-of-fold protection (fit encoder on K-1 folds, apply to the held-out fold) — otherwise you leak train labels into train features.
diff --git a/examples/fraud-detection-loose/prepare_data.py b/examples/fraud-detection-loose/prepare_data.py
new file mode 100644
index 0000000..b8da66e
--- /dev/null
+++ b/examples/fraud-detection-loose/prepare_data.py
@@ -0,0 +1,199 @@
+"""Download IEEE-CIS data and build the fixed train/val parquets used by train.py.
+
+Produces `data/base_train_small.parquet` (100K rows, stratified by fraud) and
+`data/base_val_small.parquet` (25K rows, time-later subsample). Identical SHA-256
+to the parquets used in the published case study.
+
+Usage:
+    # 1. Put your Kaggle API token at ~/.kaggle/kaggle.json
+    #    (see https://github.com/Kaggle/kaggle-api#api-credentials)
+    # 2. Join the competition on kaggle.com/c/ieee-fraud-detection to accept rules
+    # 3. Run:
+    python prepare_data.py
+
+Runtime: ~2-3 minutes on a modern laptop. Produces ~150MB of parquet files.
+
+Pipeline (must stay byte-identical to the originals — see the expected SHA-256s printed by main()):
+1. Merge `train_transaction.csv` + `train_identity.csv` on TransactionID.
+2. Time-based 80/20 split on TransactionDT (last 20% by time = validation).
+3. V-feature correlation pruning: sample 10_000 rows from the FULL merged df with
+   `random_state=42`, drop V-cols whose pairwise |corr| > 0.95.
+4. Label-encode all `object`/`string` columns using categories from the
+   `concat(train, val)` dtype, so the same string maps to the same int in both
+   splits.
+5. **Stratified** subsample to 100K train via global `np.random.seed(42)` +
+   `np.random.choice` over fraud/legit indices (preserves the 3.5% fraud rate
+   exactly), and a uniform 25K val subsample drawn from the same RNG state.
+
+Each of these details matters for reproducing the published baseline AUC of
+0.910171. In particular:
+- "object" alone misses pandas-3 string-dtype columns; include "string" too.
+- pandas `df.sample()` and `np.random.seed`+`np.random.choice` give DIFFERENT
+  rows even with the same seed — the original used the latter.
+"""
+
+from __future__ import annotations
+
+import subprocess
+import sys
+import zipfile
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+DATA_DIR = Path(__file__).parent / "data"
+TRAIN_SIZE = 100_000
+VAL_SIZE = 25_000
+TIME_SPLIT_FRAC = 0.8
+SEED = 42
+V_CORR_SAMPLE = 10_000
+V_CORR_THRESHOLD = 0.95
+
+
+def download_kaggle() -> None:
+    """Download ieee-fraud-detection via the Kaggle CLI."""
+    DATA_DIR.mkdir(exist_ok=True)
+    txn = DATA_DIR / "train_transaction.csv"
+    ident = DATA_DIR / "train_identity.csv"
+    if txn.exists() and ident.exists():
+        print(f"[skip] raw CSVs already present in {DATA_DIR}")
+        return
+
+    print(f"[download] kaggle competitions download -c ieee-fraud-detection -p {DATA_DIR}")
+    print("[download] this takes ~1-2 min over a fast link; ~120MB of CSVs")
+    # Use `python -m kaggle.cli` — the `kaggle` package has no __main__, so
+    # `python -m kaggle` fails. kaggle.cli is the canonical entry point.
+    try:
+        subprocess.check_call(
+            [sys.executable, "-m", "kaggle.cli", "competitions", "download",
+             "-c", "ieee-fraud-detection", "-p", str(DATA_DIR)]
+        )
+    except subprocess.CalledProcessError as e:
+        print(
+            "\n[error] Kaggle download failed. Most common causes:\n"
+            "  1. You haven't joined the competition. Visit\n"
+            "     https://www.kaggle.com/c/ieee-fraud-detection\n"
+            "     and click 'Late Submission' / 'Join Competition' to accept the rules.\n"
+            "  2. ~/.kaggle/kaggle.json is missing or has wrong permissions.\n"
+            "     Run: chmod 600 ~/.kaggle/kaggle.json\n",
+            file=sys.stderr,
+        )
+        raise SystemExit(e.returncode)
+    zip_path = DATA_DIR / "ieee-fraud-detection.zip"
+    print(f"[extract] {zip_path}")
+    with zipfile.ZipFile(zip_path) as zf:
+        zf.extractall(DATA_DIR)
+    zip_path.unlink()
+
+
+def time_based_split(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
+    split_point = df["TransactionDT"].quantile(TIME_SPLIT_FRAC)
+    train = df[df["TransactionDT"] <= split_point].copy()
+    val = df[df["TransactionDT"] > split_point].copy()
+    return train, val
+
+
+def reduce_v_features(
+    df_full: pd.DataFrame, train: pd.DataFrame, val: pd.DataFrame
+) -> tuple[pd.DataFrame, pd.DataFrame, list[str]]:
+    """Drop V-cols whose pairwise |corr| > threshold, sampled from FULL merged df."""
+    v_cols = [c for c in df_full.columns if c.startswith("V")]
+    if not v_cols:
+        return train, val, []
+    sample = df_full[v_cols].sample(n=min(V_CORR_SAMPLE, len(df_full)), random_state=SEED)
+    corr = sample.corr().abs()
+    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
+    to_drop = [c for c in upper.columns if (upper[c] > V_CORR_THRESHOLD).any()]
+    return train.drop(columns=to_drop), val.drop(columns=to_drop), to_drop
+
+
+def label_encode_with_combined_categories(
+    train: pd.DataFrame, val: pd.DataFrame
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Encode all object/string cols using categories from concat(train, val).
+
+    Important: include both "object" AND "string" — pandas 3 strings have
+    StringDtype and aren't picked up by `include=["object"]` alone.
+    """
+    obj_cols = train.select_dtypes(include=["object", "string"]).columns
+    obj_cols = [c for c in obj_cols if c not in ("TransactionID", "isFraud")]
+    for col in obj_cols:
+        combined = pd.concat([train[col], val[col]]).astype("category")
+        cats = combined.cat.categories
+        train[col] = train[col].astype("category").cat.set_categories(cats).cat.codes
+        val[col] = val[col].astype("category").cat.set_categories(cats).cat.codes
+    return train, val
+
+
+def stratified_subsample(
+    train: pd.DataFrame, val: pd.DataFrame
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Stratified train subsample preserving fraud rate; uniform val subsample.
+
+    Uses ONE global `np.random.seed(42)` then sequential `np.random.choice`
+    calls — the val subsample inherits the RNG state advanced by the train
+    subsample. This sequential coupling matters for reproducibility.
+    """
+    np.random.seed(SEED)
+    fraud_idx = train[train["isFraud"] == 1].index
+    legit_idx = train[train["isFraud"] == 0].index
+    fraud_rate = len(fraud_idx) / len(train)
+    n_fraud = int(TRAIN_SIZE * fraud_rate)
+    n_legit = TRAIN_SIZE - n_fraud
+    si = np.sort(
+        np.concatenate([
+            np.random.choice(fraud_idx, n_fraud, replace=False),
+            np.random.choice(legit_idx, n_legit, replace=False),
+        ])
+    )
+    train_small = train.loc[si].reset_index(drop=True)
+    val_small = val.iloc[
+        np.random.choice(len(val), VAL_SIZE, replace=False)
+    ].reset_index(drop=True)
+    return train_small, val_small
+
+
+def main() -> None:
+    download_kaggle()
+
+    train_out = DATA_DIR / "base_train_small.parquet"
+    val_out = DATA_DIR / "base_val_small.parquet"
+    if train_out.exists() and val_out.exists():
+        print(f"[skip] {train_out.name} and {val_out.name} already exist")
+        return
+
+    print("[load] merging train_transaction + train_identity")
+    txn = pd.read_csv(DATA_DIR / "train_transaction.csv")
+    ident = pd.read_csv(DATA_DIR / "train_identity.csv")
+    df = txn.merge(ident, on="TransactionID", how="left")
+    print(f"[load] shape={df.shape}, fraud rate {df['isFraud'].mean():.3%}")
+
+    print("[split] time-based 80/20")
+    train, val = time_based_split(df)
+    print(f"[split] train={len(train)} val={len(val)}")
+
+    print("[v-reduce] correlation pruning over full merged df")
+    train, val, dropped = reduce_v_features(df, train, val)
+    print(f"[v-reduce] dropped {len(dropped)} V cols (threshold {V_CORR_THRESHOLD})")
+
+    print("[encode] label-encode object/string cols using combined categories")
+    train, val = label_encode_with_combined_categories(train, val)
+
+    print("[subsample] stratified train, uniform val (np.random.seed=42)")
+    train_small, val_small = stratified_subsample(train, val)
+    print(f"[subsample] train={len(train_small)} (fraud {train_small['isFraud'].mean():.3%}), "
+          f"val={len(val_small)} (fraud {val_small['isFraud'].mean():.3%})")
+
+    train_small.to_parquet(train_out, index=False)
+    val_small.to_parquet(val_out, index=False)
+    print(f"[write] {train_out}")
+    print(f"[write] {val_out}")
+    print()
+    print("Expected SHA-256 (matches the published case study parquets):")
+    print("  train: a2d7a6740559975b8e6d89bd605f1e29791dd7d3fee8abc6449552bbc18d29ae")
+    print("  val:   8b426c8bf7fa845bc234dbce304b1107fd295143fac2398bab97b78805f50753")
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/examples/fraud-detection-loose/requirements.txt b/examples/fraud-detection-loose/requirements.txt
new file mode 100644
index 0000000..188fe96
--- /dev/null
+++ b/examples/fraud-detection-loose/requirements.txt
@@ -0,0 +1,7 @@
+weco
+numpy>=1.24
+pandas>=2.0
+scikit-learn>=1.3
+lightgbm>=4.0
+pyarrow>=13.0
+kaggle>=1.6
diff --git a/examples/fraud-detection/train.py b/examples/fraud-detection-loose/train.py
similarity index 100%
rename from examples/fraud-detection/train.py
rename to examples/fraud-detection-loose/train.py
diff --git a/examples/fraud-detection/README.md b/examples/fraud-detection/README.md
index c2d7b44..a712260 100644
--- a/examples/fraud-detection/README.md
+++ b/examples/fraud-detection/README.md
@@ -1,175 +1,159 @@
-# Fraud Detection (IEEE-CIS)
+# Fraud Detection (IEEE-CIS) — strict fit/transform API
 
 Optimize a tabular fraud-detection pipeline on the
-[IEEE-CIS Fraud Detection](https://www.kaggle.com/c/ieee-fraud-detection) Kaggle
-dataset (real Vesta payment transactions). Weco rewrites `train.py` — both
-feature engineering and the LightGBM configuration — to maximize AUC-ROC on a
-held-out, time-based validation split.
+[IEEE-CIS Fraud Detection](https://www.kaggle.com/c/ieee-fraud-detection)
+Kaggle dataset (real Vesta payment transactions). Weco rewrites two files —
+`features.py` (a `FeatureBuilder` with `fit`/`transform`) and `model.py` (a
+`train_and_evaluate` function) — one at a time or both together,
+to maximize AUC-ROC on a held-out time-based validation split.
 
-This example reproduces the setup from Weco's fraud-detection case study
+This example reproduces Weco's fraud-detection case study
 ([blog post](https://weco.ai/blog/framing-the-problem),
-[code](https://github.com/WecoAI/fraud-detection-case-study)). The example's
-baseline is **AUC ≈ 0.9102** (deterministic; verifiable via the SHA-256s
-in `prepare_data.py`). The case study reported 0.914, which used a slightly
-leaky `build_features` (concat-then-groupby on train+val); this example's
-`train.py` fits all encoders on `train_df` only — no time-leakage. With the
-bundled `instructions.md` and 200 steps of `gemini-3.1-pro-preview`, expect
-AUC in the **0.928–0.933** range.
+[code](https://github.com/WecoAI/fraud-detection-case-study)) with an
+**API that makes train/val leakage impossible by construction** — see the
+"Why this design" section below.
+
+Baseline AUC: **0.9091** (deterministic; reproducible by running `python evaluate.py`
+after `python prepare_data.py`). With the bundled `instructions.md` and 200
+steps of `gemini-3.1-pro-preview`, expect AUC in the **0.928-0.933** range.
+
+## Layout
+
+```
+features.py     ← Weco edits this for Features-only scope.
+                 Defines FeatureBuilder.fit(X_train, y_train) + transform(X).
+model.py        ← Weco edits this for Model-only scope.
+                 Defines train_and_evaluate(X_train, y_train, X_val, y_val).
+evaluate.py     ← Frozen orchestrator. Loads data, calls fit/transform/train, prints AUC.
+prepare_data.py ← One-off Kaggle download + parquet build. Run once.
+instructions.md ← Domain knowledge prompt for Weco (--additional-instructions).
+```
+
+## Why this design
+
+The original case study had `build_features(train_df, val_df)` in a single
+function — the agent could `pd.concat([train, val])` and silently introduce
+time-leakage. We measured the inflation at 0.001-0.005 AUC, and found that
+even with explicit "fit on train only" warnings in the prompt, Weco's
+proposals frequently reintroduced the leak.
+
+This API kills both leakage flavors at the interface boundary:
+
+| Leakage path | Killed by |
+|---|---|
+| `isFraud` in cross-column aggregations | `evaluate.py` strips `isFraud` before X reaches `FeatureBuilder` |
+| `pd.concat([train_df, val_df])` for groupby/freq | `val_df` is never visible to `fit()` |
+| Val labels at predict time | `transform(X)` has no `y` argument |
+
+Weco can't write the leaky pattern because the leaky symbols literally aren't
+in scope.
 
 ## Prerequisites
 
-1. **Kaggle API token**. Put a valid `kaggle.json` at `~/.kaggle/kaggle.json`
-   (see [Kaggle API credentials](https://github.com/Kaggle/kaggle-api#api-credentials)),
-   then `chmod 600 ~/.kaggle/kaggle.json` to silence the permissions warning.
-2. **You must join the competition.** Visit
-   <https://www.kaggle.com/c/ieee-fraud-detection> and click "Late Submission" /
-   "Join Competition" to accept the rules. Without this,
-   `prepare_data.py` will fail with `403 Forbidden` from the Kaggle API —
-   this is the single most common first-time friction.
-3. **Weco API key** (free tier is fine). See the
-   [Weco docs](https://docs.weco.ai).
+1. **Kaggle API token** at `~/.kaggle/kaggle.json` and
+   `chmod 600 ~/.kaggle/kaggle.json`.
+2. **Join the competition** at <https://www.kaggle.com/c/ieee-fraud-detection>
+   (Late Submission / Join Competition). Without this, `prepare_data.py`
+   gets a 403 from Kaggle.
+3. **Weco API key** — see the [Weco docs](https://docs.weco.ai).
 
 ## Setup
 
 ```bash
 cd examples/fraud-detection
 
-# Virtualenv is strongly recommended — modern Python installs (Debian/Ubuntu,
-# recent Homebrew) refuse `pip install` to the system site-packages under
-# PEP 668. If you skip this step you'll hit
-# `error: externally-managed-environment`.
+# Virtualenv strongly recommended (PEP 668).
 python3 -m venv .venv
-source .venv/bin/activate     # Windows: .venv\Scripts\activate
-# After activation, `python` resolves to the venv's interpreter.
+source .venv/bin/activate
 
 pip install --upgrade -r requirements.txt
-# Always pull the latest weco-cli — never pin. Recent versions ship important
-# fixes (e.g. 0.3.31 added queue-mode submit recovery that prevents transient
-# network errors from prematurely terminating runs). `--upgrade` ensures you
-# pick those up even if an older weco is already installed in the venv.
-
-# Downloads ~120MB of CSVs, builds a small 100K/25K parquet split.
-# Time-based split: last 20% of transactions by TransactionDT = validation.
-# ~2-3 minutes on a modern laptop.
+# --upgrade is intentional: weco-cli ships fixes regularly. 0.3.31 added
+# native auto-resume that fixes a transient submit-failure race that earlier
+# versions hit.
+
+# Download Kaggle data + build a 100K/25K time-based split. ~2-3 min.
 python prepare_data.py
+
+# Sanity check — should print auc_roc: 0.909132 deterministically.
+python evaluate.py
 ```
 
-After this you should have:
+## Run Weco
 
-```
-data/
-  train_transaction.csv, train_identity.csv, test_*.csv  # raw
-  base_train_small.parquet   # 100K rows, time-ordered
-  base_val_small.parquet     # 25K rows, later in time
+Three scope options. Pick one:
+
+### Full pipeline (recommended)
+
+```bash
+weco run \
+    --sources features.py model.py \
+    --eval-command "python evaluate.py" \
+    --metric auc_roc --goal maximize \
+    --steps 200 \
+    --model gemini-3.1-pro-preview \
+    --additional-instructions instructions.md \
+    --eval-timeout 900 \
+    --log-dir .runs/full
 ```
 
-## Quick sanity check
+Weco edits both files. Best AUC across seeds: ~0.929-0.933.
 
-Run the baseline once to confirm everything loads:
+### Features only
 
 ```bash
-python evaluate.py
-# → auc_roc: 0.910171   (deterministic, takes ~30s)
+weco run \
+    --sources features.py \
+    --eval-command "python evaluate.py" \
+    --metric auc_roc --goal maximize \
+    --steps 200 \
+    --model gemini-3.1-pro-preview \
+    --additional-instructions instructions.md \
+    --eval-timeout 900 \
+    --log-dir .runs/features
 ```
 
-If you see an AUC in the 0.90-0.92 range, you're ready.
-
-## Run Weco
+`model.py` stays at its baseline LightGBM. Weco can only improve features.
 
-The "default" run uses the full EDA + techniques instructions (recommended —
-they contain the column semantics and known-good techniques for this dataset):
+### Model only
 
 ```bash
-weco run --source train.py \
-     --eval-command "python evaluate.py" \
-     --metric auc_roc \
-     --goal maximize \
-     --steps 50 \
-     --model gemini-3.1-pro-preview \
-     --additional-instructions instructions.md \
-     --eval-timeout 300 \
-     --log-dir .runs/fraud-detection
+weco run \
+    --sources model.py \
+    --eval-command "python evaluate.py" \
+    --metric auc_roc --goal maximize \
+    --steps 200 \
+    --model gemini-3.1-pro-preview \
+    --additional-instructions instructions.md \
+    --eval-timeout 900 \
+    --log-dir .runs/model
 ```
 
-Expected trajectory:
-
-- Steps 1–10: Weco explores — tries log-amount, simple aggregations, category
-  encodings. AUC moves into 0.918-0.925.
-- Steps 10–50: builds UID-style features (card1 + addr1 + account-creation
-  estimate via `D1`), target encoding with out-of-fold protection, velocity
-  features. AUC climbs to 0.928-0.933.
-- Beyond step 50: diminishing returns; the pooled mean across 6 seeds in our
-  case study was 0.9305 ± 0.0035.
-
-## Explanation
-
-- `--source train.py` — the file Weco rewrites. Both `build_features` and
-  `train_and_evaluate` are fair game.
-- `--eval-command "python evaluate.py"` — called after every proposed edit;
-  reimports `train.py`, runs the pipeline, prints `auc_roc: 0.xxxxxx`. Weco
-  parses the last line matching `--metric`.
-- `--metric auc_roc --goal maximize` — Weco optimizes the metric printed by
-  the evaluator.
-- `--additional-instructions instructions.md` — injects domain context into
-  every optimization step. **This is what mostly matters.** See the
-  case study: EDA-level instructions (what each column means in this
-  specific dataset) drive most of the gain. Kaggle-classic techniques are
-  typically already in the LLM's pretraining distribution. Feed the optimizer
-  what it couldn't already know — dataset-specific semantics, proprietary
-  heuristics, internal constraints.
-- `--eval-timeout 300` — one eval takes ~30-60s; 300s gives headroom for
-  feature-heavy proposals.
+Features are frozen at the baseline `FeatureBuilder`. Weco can only improve
+the model. Headroom is small (~+0.008 AUC) on this task — model tuning isn't
+where the wins live for tabular fraud.
 
 ## Things to try
 
-1. **No instructions baseline**: remove `--additional-instructions` and watch
-   variance across seeds balloon (std ~0.008 vs ~0.002 with instructions).
-   Also watch for silently-leaky proposals (see below).
-2. **EDA only**: keep only the column-meaning section of `instructions.md` —
-   the case study found this accounts for most of the mean gain.
-3. **Scope restriction**: point Weco at `train.py`'s `build_features` only by
-   editing the file to expose just that function (or split the pipeline into
-   `features.py` + `model.py`). In our case study, features-only delivered
-   most of the improvement that full-pipeline did.
-
-## Watch out for silent leakage
-
-Two flavors both show up in IEEE-CIS optimization runs.
-
-**Target leakage** — `isFraud` ends up encoded into features. A plausible
-idea like "count how many columns are zero per row" becomes leaky if the
-dataframe still contains `isFraud`, because fraud rows contribute a
-different count than non-fraud rows. The baseline `build_features` drops
-`isFraud` and `TransactionID` up-front; don't let proposals reintroduce
-aggregations on a dataframe that still has the label. The case study walks
-through a real instance where this bug reported AUC 0.9591 that dropped to
-0.9154 after a one-line fix — see
-<https://weco.ai/blog/framing-the-problem>.
-
-**Time leakage** — validation-period statistics leak into train features.
-This is a time-based split; at serving time you don't have the val period.
-Any encoder, groupby aggregation, frequency count, or target encoding must
-be **fit on `train_df` only** and then applied to both splits. The baseline
-demonstrates the pattern — fit `card1_amt_mean` on train, `.join` it onto
-both train and val, fill unseen val keys with a train-global default. If a
-proposal does `pd.concat([train_df, val_df]).groupby(...)`, that's a leak
-even if it drops `isFraud` first.
-
-Signs a run has one of these leaks (AUC suspiciously high on this 100K/25K
-subsample, e.g. > 0.95):
-
-- Any `df.sum`/`df.mean`/`(df == x)` across all columns before the label is
-  dropped.
-- Target encoding without out-of-fold protection (encoder fit on full train
-  then applied to train).
-- Groupby / value-counts / target encoders fit on `pd.concat([train, val])`.
-- Features computed using validation data at all — velocity features that
-  sort train + val together and take row-wise diffs, etc.
+1. **No instructions** — drop `--additional-instructions`. Watch variance
+   across seeds balloon (~3-5×). Watch for proposed code that leaks
+   train/val statistics inside `fit()` even though the interface tries to
+   prevent it (it's harder, but still possible if `fit` calls into shared
+   helpers — the API keeps Weco honest at the boundary, not deep inside).
+2. **EDA-only vs Tech-only** — split `instructions.md` into two prompts.
+   The case study found EDA (column meanings) drives most of the gain;
+   technique listings (UID construction, target encoding, etc.) are mostly
+   already in the LLM's pretraining and add little.
+3. **Disable auto-resume** — pass `--no-auto-resume` to see what transient
+   failures look like without 0.3.31's recovery.
 
 ## Citing the case study
 
-If you use this example, the underlying numbers come from
-<https://github.com/WecoAI/fraud-detection-case-study>. Setup: 200 steps,
-3 seeds per condition (6 for the Full pipeline + Full-instructions condition,
-pooled since the two ablations share that configuration),
-`gemini-3.1-pro-preview`.
+Numbers come from <https://github.com/WecoAI/fraud-detection-case-study>.
+Setup: 200 steps, 3 seeds per condition, `gemini-3.1-pro-preview`. Strict-API
+rerun on a clean leakage-safe baseline (this example).
+
+## See also
+
+- `examples/fraud-detection-loose/` — earlier single-file API (`train.py` with
+  `build_features(train_df, val_df)`). Kept for comparison; not recommended
+  for new work because it admits time-leakage.
diff --git a/examples/fraud-detection/evaluate.py b/examples/fraud-detection/evaluate.py
index d3ad2d6..0a51825 100644
--- a/examples/fraud-detection/evaluate.py
+++ b/examples/fraud-detection/evaluate.py
@@ -1,30 +1,55 @@
-"""Evaluator Weco calls after each proposed edit.
+"""Evaluator — FROZEN. Loads data, runs FeatureBuilder + train_and_evaluate,
+prints `auc_roc: 0.xxxxxx`.
 
-Loads train.py fresh each run (Weco rewrites it in place), executes the
-pipeline, and prints a single `auc_roc: 0.xxxxxx` line that Weco parses as
-the metric.
+This file is the API enforcement boundary. Weco never edits it.
+
+The interface contract this file enforces:
+- isFraud and TransactionID are stripped before X reaches FeatureBuilder.
+- val data is never passed to fit() — only X_train + y_train.
+- transform() is called once each on X_train and X_val with no `y`.
+- Model code receives features only as ndarrays — no DataFrame metadata to peek at.
 """
 
 from __future__ import annotations
 
-import importlib.util
 import sys
 from pathlib import Path
 
-
-def load_module(path: str):
-    spec = importlib.util.spec_from_file_location("train_under_test", path)
-    mod = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(mod)
-    return mod
+import pandas as pd
 
 
 def main() -> int:
-    train = load_module(str(Path(__file__).parent / "train.py"))
-    auc = train.run_pipeline()
+    here = Path(__file__).resolve().parent
+    train_df = pd.read_parquet(here / "data" / "base_train_small.parquet")
+    val_df = pd.read_parquet(here / "data" / "base_val_small.parquet")
+
+    y_train = train_df["isFraud"].values.astype("int32")
+    y_val = val_df["isFraud"].values.astype("int32")
+
+    # Strip target and ID before either file's code can see them.
+    X_train = train_df.drop(columns=["isFraud", "TransactionID"])
+    X_val = val_df.drop(columns=["isFraud", "TransactionID"])
+
+    # Import here so that any syntax error in features.py / model.py surfaces
+    # as a real error, not a silent module-cache hit.
+    sys.path.insert(0, str(here))
+    from features import FeatureBuilder
+    from model import train_and_evaluate
+
+    fb = FeatureBuilder().fit(X_train, y_train)
+    X_train_t = fb.transform(X_train)
+    X_val_t = fb.transform(X_val)
+
+    if X_train_t.shape[1] != X_val_t.shape[1]:
+        print(
+            f"Constraint violated: train and val transform produced different "
+            f"feature counts ({X_train_t.shape[1]} vs {X_val_t.shape[1]})"
+        )
+        return 1
 
+    auc = train_and_evaluate(X_train_t, y_train, X_val_t, y_val)
     if not (0.0 <= auc <= 1.0):
-        print(f"Constraint violated: AUC-ROC out of range ({auc})")
+        print(f"Constraint violated: AUC out of range ({auc})")
         return 1
 
     print(f"auc_roc: {auc:.6f}")
diff --git a/examples/fraud-detection/features.py b/examples/fraud-detection/features.py
new file mode 100644
index 0000000..3268ded
--- /dev/null
+++ b/examples/fraud-detection/features.py
@@ -0,0 +1,104 @@
+"""Feature engineering — Weco optimizes this file (and only this file) for the
+Features-only scope. fit() sees only train; transform() sees only X.
+
+Interface contract enforced by evaluate.py:
+- isFraud and TransactionID are stripped before X reaches FeatureBuilder.
+- val data is never visible during fit() — time leakage is impossible.
+- transform() has no `y` argument — val labels can't influence val features.
+
+What you CAN do here:
+- Fit frequency, target, and group encoders on (X_train, y_train) inside fit().
+- Use K-fold OOF protection if you want target encoding *within* train.
+- Construct UIDs (e.g. card1+addr1+account-creation-day proxy) and aggregate.
+- Stash any state in `self.*` so transform() can apply it deterministically.
+
+What you CANNOT do:
+- Concatenate train+val (val is not in scope).
+- Branch on y in transform() (it's not an argument).
+- Recompute encoders during transform — only look up self.* state.
+
+Output: a numpy float32 array. transform must produce the same n_features and
+the same column order on both train and val.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+
+
+class FeatureBuilder:
+    def __init__(self) -> None:
+        # State populated by fit, read by transform.
+        self.freq_: dict[str, dict] = {}
+        self.amt_stats_: dict[str, dict[str, dict]] = {}
+        self.train_amt_mean_: float = 0.0
+        self.train_amt_std_: float = 0.0
+
+    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> "FeatureBuilder":
+        # Frequency encoders fit on train values only.
+        for col in ("card1", "card2", "card5", "addr1"):
+            if col in X_train.columns:
+                self.freq_[col] = X_train[col].value_counts(normalize=True).to_dict()
+
+        # Group amount aggregations fit on train rows only.
+        for key in ("card1", "addr1"):
+            if key not in X_train.columns:
+                continue
+            grp = X_train.groupby(key)["TransactionAmt"]
+            stats = grp.agg(["mean", "std", "count"]).fillna(0)
+            self.amt_stats_[key] = {
+                "mean": stats["mean"].to_dict(),
+                "std": stats["std"].to_dict(),
+                "count": stats["count"].to_dict(),
+            }
+
+        # Train-global defaults for unseen keys at transform time.
+        self.train_amt_mean_ = float(X_train["TransactionAmt"].mean())
+        self.train_amt_std_ = float(X_train["TransactionAmt"].std())
+        return self
+
+    def transform(self, X: pd.DataFrame) -> np.ndarray:
+        """Apply self.* state to X. Called once each on X_train and X_val."""
+        out = pd.DataFrame(index=X.index)
+
+        # Per-row time features (no cross-row dependency).
+        out["hour"] = (X["TransactionDT"] // 3600) % 24
+        out["day_of_week"] = (X["TransactionDT"] // 86400) % 7
+        out["hour_sin"] = np.sin(2 * np.pi * out["hour"] / 24)
+        out["hour_cos"] = np.cos(2 * np.pi * out["hour"] / 24)
+
+        # Per-row amount features.
+        out["TransactionAmt"] = X["TransactionAmt"].astype(np.float32)
+        out["TransactionAmt_log"] = np.log1p(X["TransactionAmt"])
+        out["TransactionAmt_decimal"] = (
+            X["TransactionAmt"] - X["TransactionAmt"].astype(int)
+        ).round(2)
+        out["TransactionAmt_is_round"] = (out["TransactionAmt_decimal"] == 0).astype(np.int8)
+
+        # Frequency lookups (unseen keys → 0).
+        for col in ("card1", "card2", "card5", "addr1"):
+            if col in X.columns and col in self.freq_:
+                out[f"{col}_freq"] = X[col].map(self.freq_[col]).fillna(0)
+
+        # Group amount aggregations (unseen keys → train-global default).
+        for key in ("card1", "addr1"):
+            if key in X.columns and key in self.amt_stats_:
+                s = self.amt_stats_[key]
+                out[f"{key}_amt_mean"] = X[key].map(s["mean"]).fillna(self.train_amt_mean_)
+                out[f"{key}_amt_std"] = X[key].map(s["std"]).fillna(self.train_amt_std_)
+                out[f"{key}_amt_count"] = X[key].map(s["count"]).fillna(0)
+
+        # Pass-through every remaining numeric column.
+        for col in X.columns:
+            if col == "TransactionDT":
+                continue
+            if col in out.columns:
+                continue
+            if pd.api.types.is_numeric_dtype(X[col]):
+                out[col] = X[col].values
+
+        return out.values.astype(np.float32)
+
+    def fit_transform(self, X_train: pd.DataFrame, y_train: pd.Series) -> np.ndarray:
+        return self.fit(X_train, y_train).transform(X_train)
diff --git a/examples/fraud-detection/model.py b/examples/fraud-detection/model.py
new file mode 100644
index 0000000..6bfbbd9
--- /dev/null
+++ b/examples/fraud-detection/model.py
@@ -0,0 +1,69 @@
+"""Model training and evaluation — Weco optimizes this file for the Model
+scope. Features arrive pre-built; labels arrive separately.
+
+Interface:
+- train_and_evaluate(X_train, y_train, X_val, y_val) -> float (val AUC)
+- X_* are float32 ndarrays of identical shape; y_* are int32 arrays of labels.
+- Return validation AUC-ROC. Print a final `auc_roc: 0.xxxxxx` line in
+  evaluate.py (this file just returns the float).
+
+What you CAN do:
+- Tune LightGBM hyperparameters, boosting strategy, num_iterations, etc.
+- Switch model class (xgboost, catboost, sklearn ensemble, custom torch model).
+- Build ensembles, stacking, blending.
+- Modify class-imbalance handling, custom objectives.
+
+What you CANNOT do:
+- See the feature column names (already projected to ndarray).
+- Re-engineer features here — features.py owns that scope.
+- Train on val labels (they're a separate argument; use them only for early
+  stopping and inside the AUC computation at the end).
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import lightgbm as lgb
+from sklearn.metrics import roc_auc_score
+
+
+def train_and_evaluate(
+    X_train: np.ndarray,
+    y_train: np.ndarray,
+    X_val: np.ndarray,
+    y_val: np.ndarray,
+) -> float:
+    """Train a model on (X_train, y_train); return AUC-ROC on (X_val, y_val).
+
+    Reasonable-but-not-heavily-tuned LightGBM defaults. There is real headroom
+    here — class imbalance, regularization, deeper trees, more rounds, ensembles.
+    """
+    params = {
+        "objective": "binary",
+        "metric": "auc",
+        "boosting_type": "gbdt",
+        "learning_rate": 0.05,
+        "num_leaves": 127,
+        "max_depth": -1,
+        "min_child_samples": 50,
+        "subsample": 0.8,
+        "colsample_bytree": 0.8,
+        "reg_alpha": 0.1,
+        "reg_lambda": 1.0,
+        "scale_pos_weight": 1,
+        "n_jobs": 4,
+        "verbose": -1,
+        "seed": 42,
+    }
+    train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False)
+    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data, free_raw_data=False)
+    model = lgb.train(
+        params,
+        train_data,
+        num_boost_round=1000,
+        valid_sets=[val_data],
+        valid_names=["val"],
+        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)],
+    )
+    y_pred = model.predict(X_val)
+    return float(roc_auc_score(y_val, y_pred))

From b1aaa6ca326e80276fba08205ba1c3945cc585b9 Mon Sep 17 00:00:00 2001
From: Zhengyao Jiang <jzyjiangzhengyao@gmail.com>
Date: Wed, 29 Apr 2026 13:19:23 +0100
Subject: [PATCH 7/8] examples/fraud-detection: pass y_train/y_val as
 pd.Series, not ndarray

features.py annotates y_train as a pd.Series in FeatureBuilder.fit (so
proposals can call .values, .map, .to_dict on it for OOF target encoding).
Earlier evaluate.py passed the result of .values.astype("int32"), which is a
numpy ndarray, breaking any proposal that did `y_train.values` or
`y_train.map(...)`.

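Sanity-checked on a 3-seed Weco run: with the Series fix, proposals proceed
past step 1 instead of crashing on AttributeError.

Also anchors evaluate.py's data and module lookups on os.getcwd() instead of
Path(__file__).resolve().parent, so each per-seed working directory evaluates
its own copies of features.py / model.py.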

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/fraud-detection/evaluate.py | 13 ++++++++++---
 examples/fraud-detection/model.py    |  2 +-
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/examples/fraud-detection/evaluate.py b/examples/fraud-detection/evaluate.py
index 0a51825..8196b4b 100644
--- a/examples/fraud-detection/evaluate.py
+++ b/examples/fraud-detection/evaluate.py
@@ -19,12 +19,19 @@
 
 
 def main() -> int:
-    here = Path(__file__).resolve().parent
+    # IMPORTANT: use os.getcwd() (the process cwd Weco invoked us in), NOT
+    # Path(__file__).resolve() which follows symlinks back to the template
+    # directory. Weco lays down per-seed copies of features.py / model.py and
+    # invokes the evaluator with cwd = the per-seed dir; that's the
+    # authoritative anchor for finding the proposed code.
+    import os
+    here = Path(os.getcwd())
     train_df = pd.read_parquet(here / "data" / "base_train_small.parquet")
     val_df = pd.read_parquet(here / "data" / "base_val_small.parquet")
 
-    y_train = train_df["isFraud"].values.astype("int32")
-    y_val = val_df["isFraud"].values.astype("int32")
+    # Pass y as a pd.Series so .values, .map, .to_dict, etc. all work.
+    y_train = train_df["isFraud"].astype("int32")
+    y_val = val_df["isFraud"].astype("int32")
 
     # Strip target and ID before either file's code can see them.
     X_train = train_df.drop(columns=["isFraud", "TransactionID"])
diff --git a/examples/fraud-detection/model.py b/examples/fraud-detection/model.py
index 6bfbbd9..a1c7b7c 100644
--- a/examples/fraud-detection/model.py
+++ b/examples/fraud-detection/model.py
@@ -3,7 +3,7 @@
 
 Interface:
 - train_and_evaluate(X_train, y_train, X_val, y_val) -> float (val AUC)
-- X_* are float32 ndarrays of identical shape; y_* are int32 arrays of labels.
+- X_* are float32 ndarrays with the same number of columns; y_* are pd.Series of int32 labels.
 - Return validation AUC-ROC. Print a final `auc_roc: 0.xxxxxx` line in
   evaluate.py (this file just returns the float).
 

From 2efbe7f1934e237031b8362920acba307a5efb81 Mon Sep 17 00:00:00 2001
From: Zhengyao Jiang <jzyjiangzhengyao@gmail.com>
Date: Mon, 8 Jun 2026 15:36:38 +0100
Subject: [PATCH 8/8] examples/fraud-detection: apply ruff format
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Resolves the failing lint CI check. `ruff format --check .` flagged 6 files
in the new fraud-detection examples (collapse multi-line signatures/calls that
fit on one line). No functional change — whitespace only.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../fraud-detection-loose/prepare_data.py     | 28 ++++++++-----------
 examples/fraud-detection-loose/train.py       | 19 +++----------
 examples/fraud-detection/evaluate.py          |  1 +
 examples/fraud-detection/features.py          |  4 +--
 examples/fraud-detection/model.py             |  7 +----
 examples/fraud-detection/prepare_data.py      | 28 ++++++++-----------
 6 files changed, 29 insertions(+), 58 deletions(-)

diff --git a/examples/fraud-detection-loose/prepare_data.py b/examples/fraud-detection-loose/prepare_data.py
index b8da66e..d2edda5 100644
--- a/examples/fraud-detection-loose/prepare_data.py
+++ b/examples/fraud-detection-loose/prepare_data.py
@@ -66,8 +66,7 @@ def download_kaggle() -> None:
     # `python -m kaggle` fails. kaggle.cli is the canonical entry point.
     try:
         subprocess.check_call(
-            [sys.executable, "-m", "kaggle.cli", "competitions", "download",
-             "-c", "ieee-fraud-detection", "-p", str(DATA_DIR)]
+            [sys.executable, "-m", "kaggle.cli", "competitions", "download", "-c", "ieee-fraud-detection", "-p", str(DATA_DIR)]
         )
     except subprocess.CalledProcessError as e:
         print(
@@ -108,9 +107,7 @@ def reduce_v_features(
     return train.drop(columns=to_drop), val.drop(columns=to_drop), to_drop
 
 
-def label_encode_with_combined_categories(
-    train: pd.DataFrame, val: pd.DataFrame
-) -> tuple[pd.DataFrame, pd.DataFrame]:
+def label_encode_with_combined_categories(train: pd.DataFrame, val: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Encode all object/string cols using categories from concat(train, val).
 
     Important: include both "object" AND "string" — pandas 3 strings have
@@ -126,9 +123,7 @@ def label_encode_with_combined_categories(
     return train, val
 
 
-def stratified_subsample(
-    train: pd.DataFrame, val: pd.DataFrame
-) -> tuple[pd.DataFrame, pd.DataFrame]:
+def stratified_subsample(train: pd.DataFrame, val: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Stratified train subsample preserving fraud rate; uniform val subsample.
 
     Uses ONE global `np.random.seed(42)` then sequential `np.random.choice`
@@ -142,15 +137,12 @@ def stratified_subsample(
     n_fraud = int(TRAIN_SIZE * fraud_rate)
     n_legit = TRAIN_SIZE - n_fraud
     si = np.sort(
-        np.concatenate([
-            np.random.choice(fraud_idx, n_fraud, replace=False),
-            np.random.choice(legit_idx, n_legit, replace=False),
-        ])
+        np.concatenate(
+            [np.random.choice(fraud_idx, n_fraud, replace=False), np.random.choice(legit_idx, n_legit, replace=False)]
+        )
     )
     train_small = train.loc[si].reset_index(drop=True)
-    val_small = val.iloc[
-        np.random.choice(len(val), VAL_SIZE, replace=False)
-    ].reset_index(drop=True)
+    val_small = val.iloc[np.random.choice(len(val), VAL_SIZE, replace=False)].reset_index(drop=True)
     return train_small, val_small
 
 
@@ -182,8 +174,10 @@ def main() -> None:
 
     print("[subsample] stratified train, uniform val (np.random.seed=42)")
     train_small, val_small = stratified_subsample(train, val)
-    print(f"[subsample] train={len(train_small)} (fraud {train_small['isFraud'].mean():.3%}), "
-          f"val={len(val_small)} (fraud {val_small['isFraud'].mean():.3%})")
+    print(
+        f"[subsample] train={len(train_small)} (fraud {train_small['isFraud'].mean():.3%}), "
+        f"val={len(val_small)} (fraud {val_small['isFraud'].mean():.3%})"
+    )
 
     train_small.to_parquet(train_out, index=False)
     val_small.to_parquet(val_out, index=False)
diff --git a/examples/fraud-detection-loose/train.py b/examples/fraud-detection-loose/train.py
index 10e1c46..ac28323 100644
--- a/examples/fraud-detection-loose/train.py
+++ b/examples/fraud-detection-loose/train.py
@@ -24,16 +24,12 @@ def _add_row_features(df: pd.DataFrame) -> pd.DataFrame:
     df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)
 
     df["TransactionAmt_log"] = np.log1p(df["TransactionAmt"])
-    df["TransactionAmt_decimal"] = (
-        df["TransactionAmt"] - df["TransactionAmt"].astype(int)
-    ).round(2)
+    df["TransactionAmt_decimal"] = (df["TransactionAmt"] - df["TransactionAmt"].astype(int)).round(2)
     df["TransactionAmt_is_round"] = (df["TransactionAmt_decimal"] == 0).astype(np.int8)
     return df
 
 
-def build_features(
-    train_df: pd.DataFrame, val_df: pd.DataFrame
-) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+def build_features(train_df: pd.DataFrame, val_df: pd.DataFrame) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
     """Build features from the base data. Returns (X_train, y_train, X_val, y_val).
 
     Any aggregation or encoding (groupby stats, frequency, target encoding, ...)
@@ -63,9 +59,7 @@ def build_features(
     for key in ["card1", "addr1"]:
         grp = train.groupby(key)["TransactionAmt"]
         stats = grp.agg(["mean", "std", "count"]).rename(
-            columns={"mean": f"{key}_amt_mean",
-                     "std": f"{key}_amt_std",
-                     "count": f"{key}_amt_count"}
+            columns={"mean": f"{key}_amt_mean", "std": f"{key}_amt_std", "count": f"{key}_amt_count"}
         )
         # Unseen keys in val: fall back to train-global mean/std and count=0.
         defaults = {
@@ -97,12 +91,7 @@ def build_features(
     return X_train, y_train, X_val, y_val
 
 
-def train_and_evaluate(
-    X_train: np.ndarray,
-    y_train: np.ndarray,
-    X_val: np.ndarray,
-    y_val: np.ndarray,
-) -> float:
+def train_and_evaluate(X_train: np.ndarray, y_train: np.ndarray, X_val: np.ndarray, y_val: np.ndarray) -> float:
     """Train LightGBM and return AUC-ROC on the validation set.
 
     Reasonable-but-not-heavily-tuned hyperparameters. A fraud team would
diff --git a/examples/fraud-detection/evaluate.py b/examples/fraud-detection/evaluate.py
index 8196b4b..fce3a73 100644
--- a/examples/fraud-detection/evaluate.py
+++ b/examples/fraud-detection/evaluate.py
@@ -25,6 +25,7 @@ def main() -> int:
     # invokes the evaluator with cwd = the per-seed dir; that's the
     # authoritative anchor for finding the proposed code.
     import os
+
     here = Path(os.getcwd())
     train_df = pd.read_parquet(here / "data" / "base_train_small.parquet")
     val_df = pd.read_parquet(here / "data" / "base_val_small.parquet")
diff --git a/examples/fraud-detection/features.py b/examples/fraud-detection/features.py
index 3268ded..eb8578f 100644
--- a/examples/fraud-detection/features.py
+++ b/examples/fraud-detection/features.py
@@ -71,9 +71,7 @@ def transform(self, X: pd.DataFrame) -> np.ndarray:
         # Per-row amount features.
         out["TransactionAmt"] = X["TransactionAmt"].astype(np.float32)
         out["TransactionAmt_log"] = np.log1p(X["TransactionAmt"])
-        out["TransactionAmt_decimal"] = (
-            X["TransactionAmt"] - X["TransactionAmt"].astype(int)
-        ).round(2)
+        out["TransactionAmt_decimal"] = (X["TransactionAmt"] - X["TransactionAmt"].astype(int)).round(2)
         out["TransactionAmt_is_round"] = (out["TransactionAmt_decimal"] == 0).astype(np.int8)
 
         # Frequency lookups (unseen keys → 0).
diff --git a/examples/fraud-detection/model.py b/examples/fraud-detection/model.py
index a1c7b7c..d571beb 100644
--- a/examples/fraud-detection/model.py
+++ b/examples/fraud-detection/model.py
@@ -27,12 +27,7 @@
 from sklearn.metrics import roc_auc_score
 
 
-def train_and_evaluate(
-    X_train: np.ndarray,
-    y_train: np.ndarray,
-    X_val: np.ndarray,
-    y_val: np.ndarray,
-) -> float:
+def train_and_evaluate(X_train: np.ndarray, y_train: np.ndarray, X_val: np.ndarray, y_val: np.ndarray) -> float:
     """Train a model on (X_train, y_train); return AUC-ROC on (X_val, y_val).
 
     Reasonable-but-not-heavily-tuned LightGBM defaults. There is real headroom
diff --git a/examples/fraud-detection/prepare_data.py b/examples/fraud-detection/prepare_data.py
index b8da66e..d2edda5 100644
--- a/examples/fraud-detection/prepare_data.py
+++ b/examples/fraud-detection/prepare_data.py
@@ -66,8 +66,7 @@ def download_kaggle() -> None:
     # `python -m kaggle` fails. kaggle.cli is the canonical entry point.
     try:
         subprocess.check_call(
-            [sys.executable, "-m", "kaggle.cli", "competitions", "download",
-             "-c", "ieee-fraud-detection", "-p", str(DATA_DIR)]
+            [sys.executable, "-m", "kaggle.cli", "competitions", "download", "-c", "ieee-fraud-detection", "-p", str(DATA_DIR)]
         )
     except subprocess.CalledProcessError as e:
         print(
@@ -108,9 +107,7 @@ def reduce_v_features(
     return train.drop(columns=to_drop), val.drop(columns=to_drop), to_drop
 
 
-def label_encode_with_combined_categories(
-    train: pd.DataFrame, val: pd.DataFrame
-) -> tuple[pd.DataFrame, pd.DataFrame]:
+def label_encode_with_combined_categories(train: pd.DataFrame, val: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Encode all object/string cols using categories from concat(train, val).
 
     Important: include both "object" AND "string" — pandas 3 strings have
@@ -126,9 +123,7 @@ def label_encode_with_combined_categories(
     return train, val
 
 
-def stratified_subsample(
-    train: pd.DataFrame, val: pd.DataFrame
-) -> tuple[pd.DataFrame, pd.DataFrame]:
+def stratified_subsample(train: pd.DataFrame, val: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Stratified train subsample preserving fraud rate; uniform val subsample.
 
     Uses ONE global `np.random.seed(42)` then sequential `np.random.choice`
@@ -142,15 +137,12 @@ def stratified_subsample(
     n_fraud = int(TRAIN_SIZE * fraud_rate)
     n_legit = TRAIN_SIZE - n_fraud
     si = np.sort(
-        np.concatenate([
-            np.random.choice(fraud_idx, n_fraud, replace=False),
-            np.random.choice(legit_idx, n_legit, replace=False),
-        ])
+        np.concatenate(
+            [np.random.choice(fraud_idx, n_fraud, replace=False), np.random.choice(legit_idx, n_legit, replace=False)]
+        )
     )
     train_small = train.loc[si].reset_index(drop=True)
-    val_small = val.iloc[
-        np.random.choice(len(val), VAL_SIZE, replace=False)
-    ].reset_index(drop=True)
+    val_small = val.iloc[np.random.choice(len(val), VAL_SIZE, replace=False)].reset_index(drop=True)
     return train_small, val_small
 
 
@@ -182,8 +174,10 @@ def main() -> None:
 
     print("[subsample] stratified train, uniform val (np.random.seed=42)")
     train_small, val_small = stratified_subsample(train, val)
-    print(f"[subsample] train={len(train_small)} (fraud {train_small['isFraud'].mean():.3%}), "
-          f"val={len(val_small)} (fraud {val_small['isFraud'].mean():.3%})")
+    print(
+        f"[subsample] train={len(train_small)} (fraud {train_small['isFraud'].mean():.3%}), "
+        f"val={len(val_small)} (fraud {val_small['isFraud'].mean():.3%})"
+    )
 
     train_small.to_parquet(train_out, index=False)
     val_small.to_parquet(val_out, index=False)