cleanup

jimdowling · jimdowling · commit 1f50c23c5726 · 2025-12-03T16:54:25.000+01:00
diff --git a/python/hsfs/builtin_transformations.py b/python/hsfs/builtin_transformations.py
@@ -325,56 +325,63 @@ def rank_normalizer(feature: pd.Series, statistics=feature_statistics) -> pd.Ser
     return pd.Series(result, index=feature.index)
 
 
-@udf(float, drop=["feature"], mode="pandas")
+@udf(float, mode="pandas")
 def winsorize(
     feature: pd.Series, statistics=feature_statistics, context: dict | None = None
 ) -> pd.Series:
     """
     Winsorization (clipping) to limit extreme values and reduce outlier influence.
 
-    By default, clips values to the [1st, 99th] percentiles computed on the
-    training data. You can override thresholds by passing a context with
-    keys `p_low` and `p_high` (percentile values in [0, 100]).
+    Row-size preserving: outliers are replaced with percentile boundary
+    values instead of removing rows.
 
-    Example to clip at [5th, 95th]:
-        tf = winsorize("feature")
-        tf.hopsworks_udf.transformation_context = {"p_low": 5, "p_high": 95}
+    Defaults to [1st, 99th] percentiles unless overridden via context:
+      {"p_low": 5, "p_high": 95}
     """
     numerical_feature = feature.astype("float64")
     percentiles = statistics.feature.percentiles
 
-    # Defaults: 1st and 99th percentiles
+    print("1")
+    # Defaults: 1 and 99 percentiles
     p_low = 1
     p_high = 99
     if isinstance(context, dict):
         p_low = context.get("p_low", p_low)
         p_high = context.get("p_high", p_high)
 
-    # Convert percentile values to array indices
-    # Since percentiles[i] = i-th percentile, we use the value directly as index
+    print("2")
     try:
         li = int(round(float(p_low)))
         ui = int(round(float(p_high)))
     except Exception:
-        li, ui = 1, 99  # Default fallback
+        li, ui = 1, 99
 
-    # Ensure indices are within valid range [0, len(percentiles)-1]
+    print("3")
+    # Bound indices
     max_idx = len(percentiles) - 1
     li = max(0, min(max_idx, li))
     ui = max(0, min(max_idx, ui))
 
-    # Ensure lower index < upper index
+    print("4")
+
+    # Ensure li < ui
     if li >= ui:
         li, ui = 1, min(99, max_idx)
 
     lower = percentiles[li]
     upper = percentiles[ui]
 
-    # Ensure proper ordering and finiteness
+    print("5")
+    # Invalid bounds → return unchanged
     if pd.isna(lower) or pd.isna(upper) or lower > upper:
-        return numerical_feature  # no-op if invalid thresholds
+        return numerical_feature
+
+    print("6")
+    # Winsorize (no rows dropped)
+    clipped = numerical_feature.where(numerical_feature >= lower, lower).where(
+        numerical_feature <= upper, upper
+    )
 
-    clipped = numerical_feature.clip(lower=lower, upper=upper)
     return pd.Series(clipped, index=feature.index)
 
 
diff --git a/python/tests/test_builtin_log_transform.py b/python/tests/test_builtin_log_transform.py
@@ -17,6 +17,7 @@
 import math
 
 import pandas as pd
+from hsfs import engine as hopsworks_engine
 from hsfs import transformation_function
 from hsfs.engine import python as python_engine
 from hsfs.transformation_function import TransformationType
@@ -40,6 +41,7 @@ def test_log_transform_python_engine():
     )
 
     engine = python_engine.Engine()
+    hopsworks_engine.set_instance(engine=engine, engine_type="python")
 
     # Act
     result = engine._apply_transformation_function([tf], df)
@@ -50,6 +52,7 @@ def test_log_transform_python_engine():
         "log_transform_col_0_",
     ]
     expected = pd.Series([0.0, math.log(2.0), math.nan, math.nan, math.nan])
+    expected.name = "log_transform_col_0_"
     pd.testing.assert_series_equal(
         result["log_transform_col_0_"], expected, check_names=True, check_dtype=False
     )
diff --git a/python/tests/test_builtin_quantile_rank.py b/python/tests/test_builtin_quantile_rank.py
@@ -16,6 +16,7 @@
 
 import pandas as pd
 import pytest
+from hsfs import engine as hopsworks_engine
 from hsfs import transformation_function
 from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics
 from hsfs.engine import python as python_engine
@@ -47,6 +48,7 @@ def test_quantile_transformer():
     ]
 
     engine = python_engine.Engine()
+    hopsworks_engine.set_instance(engine=engine, engine_type="python")
 
     # Act
     result = engine._apply_transformation_function([tf], df)
@@ -85,6 +87,7 @@ def test_rank_normalizer():
     ]
 
     engine = python_engine.Engine()
+    hopsworks_engine.set_instance(engine=engine, engine_type="python")
 
     # Act
     result = engine._apply_transformation_function([tf], df)
@@ -123,6 +126,7 @@ def test_quantile_transformer_handles_nan():
     ]
 
     engine = python_engine.Engine()
+    hopsworks_engine.set_instance(engine=engine, engine_type="python")
 
     # Act
     result = engine._apply_transformation_function([tf], df)
diff --git a/python/tests/test_builtin_winsorize.py b/python/tests/test_builtin_winsorize.py
@@ -1,33 +1,58 @@
 import math
 
 import pandas as pd
+from hsfs import engine as hopsworks_engine
 from hsfs.builtin_transformations import winsorize
 from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics
+from hsfs.engine import python as python_engine
+from python.hsfs import transformation_function
+from python.hsfs.transformation_function import TransformationType
 
 
 def _set_percentiles(pcts):
-    # Attach transformation statistics to the winsorize UDF so the injected `statistics` is available in scope
     winsorize.transformation_statistics = [
         FeatureDescriptiveStatistics(feature_name="feature", percentiles=pcts)
     ]
+    winsorize.transformation_context = None  # reset each test
+
+
+def _run_raw(series: pd.Series) -> pd.Series:
+    """Call the raw UDF function directly so input stays a Series."""
+    fn = winsorize.get_udf(online=True)
+
+    return fn(series)
 
 
 def test_winsorize_default_thresholds():
-    # Build a simple monotonic percentile array so index == value
-    percentiles = list(range(100))  # 0..99
+    engine = python_engine.Engine()
+    hopsworks_engine.set_instance(engine=engine, engine_type="python")
+
+    percentiles = list(range(100))  # [0..99]
     _set_percentiles(percentiles)
 
-    # Input data within and outside bounds
     s = pd.Series([0.0, 1.0, 50.0, 99.0, 120.0, math.nan])
 
-    # Get callable for Python execution
-    fn = winsorize.get_udf(online=True)
-    out = fn(s)
+    tf = transformation_function.TransformationFunction(
+        hopsworks_udf=winsorize("col_0"),
+        featurestore_id=1,
+        transformation_type=TransformationType.MODEL_DEPENDENT,
+    )
+
+    percentiles = [float(i) for i in range(100)]
+    tf.transformation_statistics = [
+        FeatureDescriptiveStatistics(feature_name="col_0", percentiles=percentiles)
+    ]
+
+    engine = python_engine.Engine()
+    hopsworks_engine.set_instance(engine=engine, engine_type="python")
+
+    # Act
+    out = engine._apply_transformation_function([tf], s)
+
+    out = _run_raw(s)
 
-    # Default thresholds: [1st, 99th] => [1.0, 99.0]
     expected = pd.Series([1.0, 1.0, 50.0, 99.0, 99.0, math.nan])
 
-    # Compare element-wise allowing NaNs
     for got, want in zip(out.tolist(), expected.tolist()):
         if math.isnan(want):
             assert math.isnan(got)
@@ -36,16 +61,17 @@ def test_winsorize_default_thresholds():
 
 
 def test_winsorize_context_override():
+    engine = python_engine.Engine()
+    hopsworks_engine.set_instance(engine=engine, engine_type="python")
+
     percentiles = list(range(100))
     _set_percentiles(percentiles)
 
     s = pd.Series([2.0, 5.0, 95.0, 96.0, math.nan])
 
-    # Override to [5th, 95th]
     winsorize.transformation_context = {"p_low": 5, "p_high": 95}
 
-    fn = winsorize.get_udf(online=True)
-    out = fn(s)
+    out = _run_raw(s)
 
     expected = pd.Series([5.0, 5.0, 95.0, 95.0, math.nan])
 

Original file line number	Diff line number	Diff line change
`@@ -17,6 +17,7 @@`
`17`	`17`	`import math`
`18`	`18`
`19`	`19`	`import pandas as pd`
	`20`	`+from hsfs import engine as hopsworks_engine`
`20`	`21`	`from hsfs import transformation_function`
`21`	`22`	`from hsfs.engine import python as python_engine`
`22`	`23`	`from hsfs.transformation_function import TransformationType`
`@@ -40,6 +41,7 @@ def test_log_transform_python_engine():`
`40`	`41`	`)`
`41`	`42`
`42`	`43`	`engine = python_engine.Engine()`
	`44`	`+ hopsworks_engine.set_instance(engine=engine, engine_type="python")`
`43`	`45`
`44`	`46`	`# Act`
`45`	`47`	`result = engine._apply_transformation_function([tf], df)`
`@@ -50,6 +52,7 @@ def test_log_transform_python_engine():`
`50`	`52`	`"log_transform_col_0_",`
`51`	`53`	`]`
`52`	`54`	`expected = pd.Series([0.0, math.log(2.0), math.nan, math.nan, math.nan])`
	`55`	`+ expected.name = "log_transform_col_0_"`
`53`	`56`	`pd.testing.assert_series_equal(`
`54`	`57`	`result["log_transform_col_0_"], expected, check_names=True, check_dtype=False`
`55`	`58`	`)`