Skip to content

Commit 50b7d70

Browse files
committed
disable target_mean_encoder for now - due to separate DFs for features/labels
1 parent 6ae538e commit 50b7d70

File tree

4 files changed

+125
-124
lines changed

4 files changed

+125
-124
lines changed

python/hsfs/BUILTIN_TRANSFORMATIONS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ The HSFS library includes several ready-to-use transformation functions in `hsfs
3636

3737
- **`label_encoder(feature)`**: Encode categorical values as integers based on sorted unique values from training. Unseen categories → -1.
3838
- **`one_hot_encoder(feature)`**: One-hot encode categorical features. Unseen categories → all False.
39+
<!--
3940
- **`target_mean_encoder(feature, label)`**: Replace categories with the mean of the target variable.
4041
- **Training**: Computes per-category target means from `feature` and `label` Series.
4142
- **Serving**: Use a precomputed mapping via transformation context:
@@ -50,6 +51,7 @@ The HSFS library includes several ready-to-use transformation functions in `hsfs
5051
```
5152
- Unseen categories fall back to `global_mean` if provided, otherwise NaN.
5253
- Only the feature column is dropped; the label column is preserved.
54+
-->
5355

5456
## Usage
5557

python/hsfs/builtin_transformations.py

Lines changed: 64 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -378,67 +378,67 @@ def winsorize(
378378
return pd.Series(clipped, index=feature.index)
379379

380380

381-
@udf(float, drop=["feature"], mode="pandas")
382-
def target_mean_encoder(
383-
feature: pd.Series,
384-
label: pd.Series,
385-
statistics=feature_statistics,
386-
context: dict | None = None,
387-
) -> pd.Series:
388-
"""
389-
Target / Mean Encoding for categorical features.
390-
391-
Replaces each category in `feature` with the mean of the target variable `label`.
392-
393-
Usage notes:
394-
- During training (offline): provide both `feature` and `label`; the encoder computes
395-
the per-category mean on-the-fly from these two Series.
396-
- During serving/online or when labels are unavailable: provide a precomputed mapping via
397-
the transformation context as `{"target_means": {category: mean, ...}, "global_mean": <float>}`.
398-
Unseen categories fall back to `global_mean` when provided, otherwise NaN.
399-
- Only the input `feature` column is dropped. The `label` column is preserved in outputs.
400-
401-
Edge cases:
402-
- If `label` is entirely null or not provided (e.g., serving), a context mapping is required.
403-
- If `feature` contains NaN, the encoded value will be NaN for those rows.
404-
"""
405-
406-
# Ensure pandas Series with appropriate dtype
407-
f = feature
408-
y = label if label is not None else None
409-
410-
mapping: dict | None = None
411-
global_mean: float | None = None
412-
413-
if isinstance(context, dict):
414-
mapping = context.get("target_means") or context.get("mapping")
415-
global_mean = context.get("global_mean")
416-
417-
# Training/offline path: compute mapping from data if label provided and non-empty
418-
if y is not None and not (isinstance(y, pd.Series) and y.isna().all()):
419-
# Attempt numeric conversion for label; errors='coerce' will turn non-numeric into NaN
420-
y_num = pd.to_numeric(y, errors="coerce")
421-
# Compute category -> mean(label)
422-
df = pd.DataFrame({"__cat__": f, "__y__": y_num})
423-
means = df.groupby("__cat__")["__y__"].mean()
424-
mapping = means.to_dict()
425-
# Global mean for fallback on unseen categories at serve-time
426-
global_mean = float(y_num.mean()) if not pd.isna(y_num.mean()) else None
427-
428-
if mapping is None:
429-
# No mapping available: try to use just global mean for all known categories
430-
if global_mean is not None:
431-
return pd.Series(
432-
[global_mean if pd.notna(v) else np.nan for v in f], index=f.index
433-
)
434-
# As a last resort, return NaNs (cannot encode)
435-
return pd.Series([np.nan for _ in f], index=f.index)
436-
437-
# Map categories to target means; unseen -> global_mean (if provided) else NaN
438-
def _map_val(v):
439-
if pd.isna(v):
440-
return np.nan
441-
return mapping.get(v, global_mean)
442-
443-
encoded = f.map(_map_val)
444-
return pd.Series(encoded, index=f.index)
381+
# @udf(float, drop=["feature"], mode="pandas")
382+
# def target_mean_encoder(
383+
# feature: pd.Series,
384+
# label: pd.Series,
385+
# statistics=feature_statistics,
386+
# context: dict | None = None,
387+
# ) -> pd.Series:
388+
# """
389+
# Target / Mean Encoding for categorical features.
390+
391+
# Replaces each category in `feature` with the mean of the target variable `label`.
392+
393+
# Usage notes:
394+
# - During training (offline): provide both `feature` and `label`; the encoder computes
395+
# the per-category mean on-the-fly from these two Series.
396+
# - During serving/online or when labels are unavailable: provide a precomputed mapping via
397+
# the transformation context as `{"target_means": {category: mean, ...}, "global_mean": <float>}`.
398+
# Unseen categories fall back to `global_mean` when provided, otherwise NaN.
399+
# - Only the input `feature` column is dropped. The `label` column is preserved in outputs.
400+
401+
# Edge cases:
402+
# - If `label` is entirely null or not provided (e.g., serving), a context mapping is required.
403+
# - If `feature` contains NaN, the encoded value will be NaN for those rows.
404+
# """
405+
406+
# # Ensure pandas Series with appropriate dtype
407+
# f = feature
408+
# y = label if label is not None else None
409+
410+
# mapping: dict | None = None
411+
# global_mean: float | None = None
412+
413+
# if isinstance(context, dict):
414+
# mapping = context.get("target_means") or context.get("mapping")
415+
# global_mean = context.get("global_mean")
416+
417+
# # Training/offline path: compute mapping from data if label provided and non-empty
418+
# if y is not None and not (isinstance(y, pd.Series) and y.isna().all()):
419+
# # Attempt numeric conversion for label; errors='coerce' will turn non-numeric into NaN
420+
# y_num = pd.to_numeric(y, errors="coerce")
421+
# # Compute category -> mean(label)
422+
# df = pd.DataFrame({"__cat__": f, "__y__": y_num})
423+
# means = df.groupby("__cat__")["__y__"].mean()
424+
# mapping = means.to_dict()
425+
# # Global mean for fallback on unseen categories at serve-time
426+
# global_mean = float(y_num.mean()) if not pd.isna(y_num.mean()) else None
427+
428+
# if mapping is None:
429+
# # No mapping available: try to use just global mean for all known categories
430+
# if global_mean is not None:
431+
# return pd.Series(
432+
# [global_mean if pd.notna(v) else np.nan for v in f], index=f.index
433+
# )
434+
# # As a last resort, return NaNs (cannot encode)
435+
# return pd.Series([np.nan for _ in f], index=f.index)
436+
437+
# # Map categories to target means; unseen -> global_mean (if provided) else NaN
438+
# def _map_val(v):
439+
# if pd.isna(v):
440+
# return np.nan
441+
# return mapping.get(v, global_mean)
442+
443+
# encoded = f.map(_map_val)
444+
# return pd.Series(encoded, index=f.index)

python/hsfs/core/transformation_function_engine.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ class TransformationFunctionEngine:
3939
"equal_width_binner",
4040
"equal_frequency_binner",
4141
"quantile_binner",
42-
"target_mean_encoder",
42+
# "target_mean_encoder",
4343
]
4444
AMBIGUOUS_FEATURE_ERROR = (
4545
"Provided feature '{}' in transformation functions is ambiguous and exists in more than one feature groups."
Lines changed: 58 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,58 @@
1-
import math
2-
import numpy as np
3-
import pytest
4-
import pandas as pd
5-
6-
from hsfs.builtin_transformations import target_mean_encoder
7-
8-
9-
def test_target_mean_encoder_training_computes_means():
10-
# Feature and label series
11-
feature = pd.Series(["a", "a", "b", "b", "b", "c", None])
12-
label = pd.Series([1, 2, 0, 1, 3, 5, 2])
13-
14-
# Get python callable for the UDF
15-
fn = target_mean_encoder.get_udf(online=True)
16-
encoded = fn(feature, label)
17-
18-
# Expected per-category means
19-
# a -> (1+2)/2 = 1.5
20-
# b -> (0+1+3)/3 = 4/3
21-
# c -> (5) = 5
22-
expected = [1.5, 1.5, 4.0 / 3.0, 4.0 / 3.0, 4.0 / 3.0, 5.0, math.nan]
23-
24-
# Compare allowing NaN
25-
for got, want in zip(encoded.tolist(), expected):
26-
if math.isnan(want):
27-
assert math.isnan(got)
28-
else:
29-
assert got == pytest.approx(want, rel=1e-9, abs=1e-12)
30-
31-
32-
def test_target_mean_encoder_serving_with_context_mapping():
33-
# Only feature values; label not available at serving, pass all-NaN series
34-
feature = pd.Series(["a", "x", None, "b", "c"])
35-
dummy_label = pd.Series([np.nan] * len(feature))
36-
37-
# Provide mapping via transformation context
38-
mapping = {"a": 1.5, "b": 1.25}
39-
global_mean = 2.0
40-
target_mean_encoder.transformation_context = {
41-
"target_means": mapping,
42-
"global_mean": global_mean,
43-
}
44-
45-
fn = target_mean_encoder.get_udf(online=True)
46-
encoded = fn(feature, dummy_label)
47-
48-
# a -> 1.5 (in mapping)
49-
# x -> unseen -> fallback to global_mean (2.0)
50-
# None -> NaN
51-
# b -> 1.25 (in mapping)
52-
# c -> unseen -> fallback to global_mean (2.0)
53-
expected = [1.5, 2.0, math.nan, 1.25, 2.0]
54-
55-
for got, want in zip(encoded.tolist(), expected):
56-
if math.isnan(want):
57-
assert math.isnan(got)
58-
else:
59-
assert got == pytest.approx(want, rel=1e-9, abs=1e-12)
1+
# from hsfs.builtin_transformations import target_mean_encoder
2+
3+
4+
def test_target_mean_encoder_empty():
5+
assert True
6+
7+
8+
# def test_target_mean_encoder_training_computes_means():
9+
# # Feature and label series
10+
# feature = pd.Series(["a", "a", "b", "b", "b", "c", None])
11+
# label = pd.Series([1, 2, 0, 1, 3, 5, 2])
12+
13+
# # Get python callable for the UDF
14+
# fn = target_mean_encoder.get_udf(online=True)
15+
# encoded = fn(feature, label)
16+
17+
# # Expected per-category means
18+
# # a -> (1+2)/2 = 1.5
19+
# # b -> (0+1+3)/3 = 4/3
20+
# # c -> (5) = 5
21+
# expected = [1.5, 1.5, 4.0 / 3.0, 4.0 / 3.0, 4.0 / 3.0, 5.0, math.nan]
22+
23+
# # Compare allowing NaN
24+
# for got, want in zip(encoded.tolist(), expected):
25+
# if math.isnan(want):
26+
# assert math.isnan(got)
27+
# else:
28+
# assert got == pytest.approx(want, rel=1e-9, abs=1e-12)
29+
30+
31+
# def test_target_mean_encoder_serving_with_context_mapping():
32+
# # Only feature values; label not available at serving, pass all-NaN series
33+
# feature = pd.Series(["a", "x", None, "b", "c"])
34+
# dummy_label = pd.Series([np.nan] * len(feature))
35+
36+
# # Provide mapping via transformation context
37+
# mapping = {"a": 1.5, "b": 1.25}
38+
# global_mean = 2.0
39+
# target_mean_encoder.transformation_context = {
40+
# "target_means": mapping,
41+
# "global_mean": global_mean,
42+
# }
43+
44+
# fn = target_mean_encoder.get_udf(online=True)
45+
# encoded = fn(feature, dummy_label)
46+
47+
# # a -> 1.5 (in mapping)
48+
# # x -> unseen -> fallback to global_mean (2.0)
49+
# # None -> NaN
50+
# # b -> 1.25 (in mapping)
51+
# # c -> unseen -> fallback to global_mean (2.0)
52+
# expected = [1.5, 2.0, math.nan, 1.25, 2.0]
53+
54+
# for got, want in zip(encoded.tolist(), expected):
55+
# if math.isnan(want):
56+
# assert math.isnan(got)
57+
# else:
58+
# assert got == pytest.approx(want, rel=1e-9, abs=1e-12)

0 commit comments

Comments
 (0)