Skip to content

Commit 50b7d70

Browse files
committed
disable target_mean_encoder for now - due to separate DFs for features/labels
1 parent 6ae538e commit 50b7d70

File tree

4 files changed

+125
-124
lines changed

4 files changed

+125
-124
lines changed

python/hsfs/BUILTIN_TRANSFORMATIONS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ The HSFS library includes several ready-to-use transformation functions in `hsfs
3636

3737
- **`label_encoder(feature)`**: Encode categorical values as integers based on sorted unique values from training. Unseen categories → -1.
3838
- **`one_hot_encoder(feature)`**: One-hot encode categorical features. Unseen categories → all False.
39+
<!--
3940
- **`target_mean_encoder(feature, label)`**: Replace categories with the mean of the target variable.
4041
- **Training**: Computes per-category target means from `feature` and `label` Series.
4142
- **Serving**: Use a precomputed mapping via transformation context:
@@ -50,6 +51,7 @@ The HSFS library includes several ready-to-use transformation functions in `hsfs
5051
```
5152
- Unseen categories fall back to `global_mean` if provided, otherwise NaN.
5253
- Only the feature column is dropped; the label column is preserved.
54+
-->
5355

5456
## Usage
5557

python/hsfs/builtin_transformations.py

Lines changed: 64 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -378,67 +378,67 @@ def winsorize(
378378
return pd.Series(clipped, index=feature.index)
379379

380380

381-
@udf(float, drop=["feature"], mode="pandas")
382-
def target_mean_encoder(
383-
feature: pd.Series,
384-
label: pd.Series,
385-
statistics=feature_statistics,
386-
context: dict | None = None,
387-
) -> pd.Series:
388-
"""
389-
Target / Mean Encoding for categorical features.
390-
391-
Replaces each category in `feature` with the mean of the target variable `label`.
392-
393-
Usage notes:
394-
- During training (offline): provide both `feature` and `label`; the encoder computes
395-
the per-category mean on-the-fly from these two Series.
396-
- During serving/online or when labels are unavailable: provide a precomputed mapping via
397-
the transformation context as `{"target_means": {category: mean, ...}, "global_mean": <float>}`.
398-
Unseen categories fall back to `global_mean` when provided, otherwise NaN.
399-
- Only the input `feature` column is dropped. The `label` column is preserved in outputs.
400-
401-
Edge cases:
402-
- If `label` is entirely null or not provided (e.g., serving), a context mapping is required.
403-
- If `feature` contains NaN, the encoded value will be NaN for those rows.
404-
"""
405-
406-
# Ensure pandas Series with appropriate dtype
407-
f = feature
408-
y = label if label is not None else None
409-
410-
mapping: dict | None = None
411-
global_mean: float | None = None
412-
413-
if isinstance(context, dict):
414-
mapping = context.get("target_means") or context.get("mapping")
415-
global_mean = context.get("global_mean")
416-
417-
# Training/offline path: compute mapping from data if label provided and non-empty
418-
if y is not None and not (isinstance(y, pd.Series) and y.isna().all()):
419-
# Attempt numeric conversion for label; errors='coerce' will turn non-numeric into NaN
420-
y_num = pd.to_numeric(y, errors="coerce")
421-
# Compute category -> mean(label)
422-
df = pd.DataFrame({"__cat__": f, "__y__": y_num})
423-
means = df.groupby("__cat__")["__y__"].mean()
424-
mapping = means.to_dict()
425-
# Global mean for fallback on unseen categories at serve-time
426-
global_mean = float(y_num.mean()) if not pd.isna(y_num.mean()) else None
427-
428-
if mapping is None:
429-
# No mapping available: try to use just global mean for all known categories
430-
if global_mean is not None:
431-
return pd.Series(
432-
[global_mean if pd.notna(v) else np.nan for v in f], index=f.index
433-
)
434-
# As a last resort, return NaNs (cannot encode)
435-
return pd.Series([np.nan for _ in f], index=f.index)
436-
437-
# Map categories to target means; unseen -> global_mean (if provided) else NaN
438-
def _map_val(v):
439-
if pd.isna(v):
440-
return np.nan
441-
return mapping.get(v, global_mean)
442-
443-
encoded = f.map(_map_val)
444-
return pd.Series(encoded, index=f.index)
381+
# @udf(float, drop=["feature"], mode="pandas")
382+
# def target_mean_encoder(
383+
# feature: pd.Series,
384+
# label: pd.Series,
385+
# statistics=feature_statistics,
386+
# context: dict | None = None,
387+
# ) -> pd.Series:
388+
# """
389+
# Target / Mean Encoding for categorical features.
390+
391+
# Replaces each category in `feature` with the mean of the target variable `label`.
392+
393+
# Usage notes:
394+
# - During training (offline): provide both `feature` and `label`; the encoder computes
395+
# the per-category mean on-the-fly from these two Series.
396+
# - During serving/online or when labels are unavailable: provide a precomputed mapping via
397+
# the transformation context as `{"target_means": {category: mean, ...}, "global_mean": <float>}`.
398+
# Unseen categories fall back to `global_mean` when provided, otherwise NaN.
399+
# - Only the input `feature` column is dropped. The `label` column is preserved in outputs.
400+
401+
# Edge cases:
402+
# - If `label` is entirely null or not provided (e.g., serving), a context mapping is required.
403+
# - If `feature` contains NaN, the encoded value will be NaN for those rows.
404+
# """
405+
406+
# # Ensure pandas Series with appropriate dtype
407+
# f = feature
408+
# y = label if label is not None else None
409+
410+
# mapping: dict | None = None
411+
# global_mean: float | None = None
412+
413+
# if isinstance(context, dict):
414+
# mapping = context.get("target_means") or context.get("mapping")
415+
# global_mean = context.get("global_mean")
416+
417+
# # Training/offline path: compute mapping from data if label provided and non-empty
418+
# if y is not None and not (isinstance(y, pd.Series) and y.isna().all()):
419+
# # Attempt numeric conversion for label; errors='coerce' will turn non-numeric into NaN
420+
# y_num = pd.to_numeric(y, errors="coerce")
421+
# # Compute category -> mean(label)
422+
# df = pd.DataFrame({"__cat__": f, "__y__": y_num})
423+
# means = df.groupby("__cat__")["__y__"].mean()
424+
# mapping = means.to_dict()
425+
# # Global mean for fallback on unseen categories at serve-time
426+
# global_mean = float(y_num.mean()) if not pd.isna(y_num.mean()) else None
427+
428+
# if mapping is None:
429+
# # No mapping available: try to use just global mean for all known categories
430+
# if global_mean is not None:
431+
# return pd.Series(
432+
# [global_mean if pd.notna(v) else np.nan for v in f], index=f.index
433+
# )
434+
# # As a last resort, return NaNs (cannot encode)
435+
# return pd.Series([np.nan for _ in f], index=f.index)
436+
437+
# # Map categories to target means; unseen -> global_mean (if provided) else NaN
438+
# def _map_val(v):
439+
# if pd.isna(v):
440+
# return np.nan
441+
# return mapping.get(v, global_mean)
442+
443+
# encoded = f.map(_map_val)
444+
# return pd.Series(encoded, index=f.index)

python/hsfs/core/transformation_function_engine.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ class TransformationFunctionEngine:
3939
"equal_width_binner",
4040
"equal_frequency_binner",
4141
"quantile_binner",
42-
"target_mean_encoder",
42+
# "target_mean_encoder",
4343
]
4444
AMBIGUOUS_FEATURE_ERROR = (
4545
"Provided feature '{}' in transformation functions is ambiguous and exists in more than one feature groups."
Lines changed: 58 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,58 @@
1-
import math
2-
import numpy as np
3-
import pytest
4-
import pandas as pd
5-
6-
from hsfs.builtin_transformations import target_mean_encoder
7-
8-
9-
def test_target_mean_encoder_training_computes_means():
10-
# Feature and label series
11-
feature = pd.Series(["a", "a", "b", "b", "b", "c", None])
12-
label = pd.Series([1, 2, 0, 1, 3, 5, 2])
13-
14-
# Get python callable for the UDF
15-
fn = target_mean_encoder.get_udf(online=True)
16-
encoded = fn(feature, label)
17-
18-
# Expected per-category means
19-
# a -> (1+2)/2 = 1.5
20-
# b -> (0+1+3)/3 = 4/3
21-
# c -> (5) = 5
22-
expected = [1.5, 1.5, 4.0 / 3.0, 4.0 / 3.0, 4.0 / 3.0, 5.0, math.nan]
23-
24-
# Compare allowing NaN
25-
for got, want in zip(encoded.tolist(), expected):
26-
if math.isnan(want):
27-
assert math.isnan(got)
28-
else:
29-
assert got == pytest.approx(want, rel=1e-9, abs=1e-12)
30-
31-
32-
def test_target_mean_encoder_serving_with_context_mapping():
33-
# Only feature values; label not available at serving, pass all-NaN series
34-
feature = pd.Series(["a", "x", None, "b", "c"])
35-
dummy_label = pd.Series([np.nan] * len(feature))
36-
37-
# Provide mapping via transformation context
38-
mapping = {"a": 1.5, "b": 1.25}
39-
global_mean = 2.0
40-
target_mean_encoder.transformation_context = {
41-
"target_means": mapping,
42-
"global_mean": global_mean,
43-
}
44-
45-
fn = target_mean_encoder.get_udf(online=True)
46-
encoded = fn(feature, dummy_label)
47-
48-
# a -> 1.5 (in mapping)
49-
# x -> unseen -> fallback to global_mean (2.0)
50-
# None -> NaN
51-
# b -> 1.25 (in mapping)
52-
# c -> unseen -> fallback to global_mean (2.0)
53-
expected = [1.5, 2.0, math.nan, 1.25, 2.0]
54-
55-
for got, want in zip(encoded.tolist(), expected):
56-
if math.isnan(want):
57-
assert math.isnan(got)
58-
else:
59-
assert got == pytest.approx(want, rel=1e-9, abs=1e-12)
1+
# from hsfs.builtin_transformations import target_mean_encoder
2+
3+
4+
def test_target_mean_encoder_empty():
5+
assert True
6+
7+
8+
# def test_target_mean_encoder_training_computes_means():
9+
# # Feature and label series
10+
# feature = pd.Series(["a", "a", "b", "b", "b", "c", None])
11+
# label = pd.Series([1, 2, 0, 1, 3, 5, 2])
12+
13+
# # Get python callable for the UDF
14+
# fn = target_mean_encoder.get_udf(online=True)
15+
# encoded = fn(feature, label)
16+
17+
# # Expected per-category means
18+
# # a -> (1+2)/2 = 1.5
19+
# # b -> (0+1+3)/3 = 4/3
20+
# # c -> (5) = 5
21+
# expected = [1.5, 1.5, 4.0 / 3.0, 4.0 / 3.0, 4.0 / 3.0, 5.0, math.nan]
22+
23+
# # Compare allowing NaN
24+
# for got, want in zip(encoded.tolist(), expected):
25+
# if math.isnan(want):
26+
# assert math.isnan(got)
27+
# else:
28+
# assert got == pytest.approx(want, rel=1e-9, abs=1e-12)
29+
30+
31+
# def test_target_mean_encoder_serving_with_context_mapping():
32+
# # Only feature values; label not available at serving, pass all-NaN series
33+
# feature = pd.Series(["a", "x", None, "b", "c"])
34+
# dummy_label = pd.Series([np.nan] * len(feature))
35+
36+
# # Provide mapping via transformation context
37+
# mapping = {"a": 1.5, "b": 1.25}
38+
# global_mean = 2.0
39+
# target_mean_encoder.transformation_context = {
40+
# "target_means": mapping,
41+
# "global_mean": global_mean,
42+
# }
43+
44+
# fn = target_mean_encoder.get_udf(online=True)
45+
# encoded = fn(feature, dummy_label)
46+
47+
# # a -> 1.5 (in mapping)
48+
# # x -> unseen -> fallback to global_mean (2.0)
49+
# # None -> NaN
50+
# # b -> 1.25 (in mapping)
51+
# # c -> unseen -> fallback to global_mean (2.0)
52+
# expected = [1.5, 2.0, math.nan, 1.25, 2.0]
53+
54+
# for got, want in zip(encoded.tolist(), expected):
55+
# if math.isnan(want):
56+
# assert math.isnan(got)
57+
# else:
58+
# assert got == pytest.approx(want, rel=1e-9, abs=1e-12)

0 commit comments

Comments
 (0)