Commit 24be64d

Commit message: fixed spark unit tests
Parent: 1f50c23

File tree: 2 files changed (+55, -57 lines)

python/hsfs/builtin_transformations.py

Lines changed: 12 additions & 21 deletions
@@ -48,9 +48,9 @@ def robust_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Series:
     If IQR is zero (constant feature), the function centers the data by the
     median without scaling to avoid division by zero.
     """
-    q1 = statistics.feature.percentiles[25]
-    q2 = statistics.feature.percentiles[50]
-    q3 = statistics.feature.percentiles[75]
+    q1 = statistics.feature.percentiles[24]
+    q2 = statistics.feature.percentiles[49]
+    q3 = statistics.feature.percentiles[74]
     iqr = q3 - q1

     scaled_feature = feature.astype("float64")
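For context, a minimal standalone sketch (not part of this commit) of the robust-scaling behavior the docstring above describes: center by the median and divide by the IQR, falling back to centering only when the IQR is zero. The quartile values are hard-coded here instead of being read from statistics.feature.percentiles, and the helper name is illustrative.

import pandas as pd

def robust_scale_sketch(feature: pd.Series, q1: float, q2: float, q3: float) -> pd.Series:
    # Center by the median (q2); scale by the IQR (q3 - q1) unless it is zero.
    scaled = feature.astype("float64")
    iqr = q3 - q1
    if iqr == 0:
        return scaled - q2
    return (scaled - q2) / iqr

print(robust_scale_sketch(pd.Series([1.0, 2.0, 3.0, 100.0]), q1=1.75, q2=2.5, q3=3.25).tolist())
# [-1.0, -0.3333333333333333, 0.3333333333333333, 65.0]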
@@ -168,9 +168,9 @@ def equal_frequency_binner(
     - NaN inputs remain NaN.
     """
     s = feature.astype("float64")
-    q1 = statistics.feature.percentiles[25]
-    q2 = statistics.feature.percentiles[50]
-    q3 = statistics.feature.percentiles[75]
+    q1 = statistics.feature.percentiles[24]
+    q2 = statistics.feature.percentiles[49]
+    q3 = statistics.feature.percentiles[74]

     # Check if we have valid quartiles
     if any(pd.isna([q1, q2, q3])):
@@ -217,9 +217,9 @@ def quantile_binner(feature: pd.Series, statistics=feature_statistics) -> pd.Series:

     # Use quartiles: 25th, 50th, 75th percentiles
     p = statistics.feature.percentiles
-    q25 = p[25]  # Q1
-    q50 = p[50]  # Q2 (median)
-    q75 = p[75]  # Q3
+    q25 = p[24]  # Q1
+    q50 = p[49]  # Q2 (median)
+    q75 = p[74]  # Q3

     # Check if we have valid quartiles
     if any(pd.isna([q25, q50, q75])):
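The two binner hunks above bucket values by the same three quartiles. A rough standalone sketch of that idea follows; the bin labels and edge handling are illustrative and the actual hsfs implementation may differ, but NaN inputs stay NaN as the docstring notes.

import numpy as np
import pandas as pd

def quartile_bin_sketch(feature: pd.Series, q25: float, q50: float, q75: float) -> pd.Series:
    # Map each value to bin 0-3 depending on which quartile range it falls in.
    s = feature.astype("float64")
    bins = pd.Series(
        np.searchsorted([q25, q50, q75], s, side="right"), index=s.index, dtype="float64"
    )
    bins[s.isna()] = np.nan  # NaN inputs remain NaN
    return bins

print(quartile_bin_sketch(pd.Series([1.0, 30.0, 60.0, 90.0, np.nan]), 25.0, 50.0, 75.0).tolist())
# [0.0, 1.0, 2.0, 3.0, nan]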
@@ -325,7 +325,7 @@ def rank_normalizer(feature: pd.Series, statistics=feature_statistics) -> pd.Series:
     return pd.Series(result, index=feature.index)


-@udf(float, mode="pandas")
+@udf(float, drop=["feature"], mode="pandas")
 def winsorize(
     feature: pd.Series, statistics=feature_statistics, context: dict | None = None
 ) -> pd.Series:
@@ -341,46 +341,37 @@ def winsorize(
     numerical_feature = feature.astype("float64")
     percentiles = statistics.feature.percentiles

-    print("1")
     # Defaults: 1 and 99 percentiles
     p_low = 1
     p_high = 99
     if isinstance(context, dict):
         p_low = context.get("p_low", p_low)
         p_high = context.get("p_high", p_high)

-    print("2")
     try:
         li = int(round(float(p_low)))
         ui = int(round(float(p_high)))
     except Exception:
         li, ui = 1, 99

-    print("3")
     # Bound indices
     max_idx = len(percentiles) - 1
     li = max(0, min(max_idx, li))
     ui = max(0, min(max_idx, ui))

-    print("4")
-
     # Ensure li < ui
     if li >= ui:
         li, ui = 1, min(99, max_idx)

     lower = percentiles[li]
     upper = percentiles[ui]

-    print("5")
     # Invalid bounds → return unchanged
     if pd.isna(lower) or pd.isna(upper) or lower > upper:
         return numerical_feature

-    print("6")
-    # Winsorize (no rows dropped)
-    clipped = numerical_feature.where(numerical_feature >= lower, lower).where(
-        numerical_feature <= upper, upper
-    )
+    # Winsorize (no rows dropped), preserving NaN values
+    clipped = numerical_feature.clip(lower=lower, upper=upper)

     return pd.Series(clipped, index=feature.index)
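A quick standalone check (not part of the commit) of the behavior the new comment relies on: Series.clip bounds values to [lower, upper] while leaving NaN untouched, whereas the removed two-step .where() construction silently replaced NaN with the lower bound, because NaN >= lower evaluates to False.

import math
import pandas as pd

s = pd.Series([0.0, 1.0, 50.0, 99.0, 120.0, math.nan])
lower, upper = 1.0, 99.0

# New approach: out-of-range values are clipped, NaN stays NaN.
print(s.clip(lower=lower, upper=upper).tolist())
# [1.0, 1.0, 50.0, 99.0, 99.0, nan]

# Old approach: the NaN row fails the `>= lower` check and is filled with `lower`.
print(s.where(s >= lower, lower).where(s <= upper, upper).tolist())
# [1.0, 1.0, 50.0, 99.0, 99.0, 1.0]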

python/tests/test_builtin_winsorize.py

Lines changed: 43 additions & 36 deletions
@@ -2,43 +2,30 @@

 import pandas as pd
 from hsfs import engine as hopsworks_engine
+from hsfs import transformation_function
 from hsfs.builtin_transformations import winsorize
 from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics
 from hsfs.engine import python as python_engine
-from python.hsfs import transformation_function
-from python.hsfs.transformation_function import TransformationType
-
-
-def _set_percentiles(pcts):
-    winsorize.transformation_statistics = [
-        FeatureDescriptiveStatistics(feature_name="feature", percentiles=pcts)
-    ]
-    winsorize.transformation_context = None  # reset each test
-
-
-def _run_raw(series: pd.Series) -> pd.Series:
-    """Call the raw UDF function directly so input stays a Series."""
-    fn = winsorize.get_udf(online=True)
-
-    return fn(series)
+from hsfs.transformation_function import TransformationType


 def test_winsorize_default_thresholds():
-    engine = python_engine.Engine()
-    hopsworks_engine.set_instance(engine=engine, engine_type="python")
-
-    percentiles = list(range(100))  # [0..99]
-    _set_percentiles(percentiles)
-
-    s = pd.Series([0.0, 1.0, 50.0, 99.0, 120.0, math.nan])
+    # Arrange
+    df = pd.DataFrame(
+        {
+            "col_0": [0.0, 1.0, 50.0, 99.0, 120.0, math.nan],
+            "other": ["a", "b", "c", "d", "e", "f"],
+        }
+    )

     tf = transformation_function.TransformationFunction(
         hopsworks_udf=winsorize("col_0"),
         featurestore_id=1,
         transformation_type=TransformationType.MODEL_DEPENDENT,
     )

-    percentiles = [float(i) for i in range(100)]
+    # Percentiles from 0th to 100th (101 values)
+    percentiles = [float(i) for i in range(101)]
     tf.transformation_statistics = [
         FeatureDescriptiveStatistics(feature_name="col_0", percentiles=percentiles)
     ]
@@ -47,35 +34,55 @@ def test_winsorize_default_thresholds():
     hopsworks_engine.set_instance(engine=engine, engine_type="python")

     # Act
-    out = engine._apply_transformation_function([tf], s)
-
-    out = _run_raw(s)
+    result = engine._apply_transformation_function([tf], df)

+    # Assert - defaults clip at 1st and 99th percentiles (values 1 and 99)
+    assert list(result.columns) == ["other", "winsorize_col_0_"]
     expected = pd.Series([1.0, 1.0, 50.0, 99.0, 99.0, math.nan])
+    expected.name = "winsorize_col_0_"

-    for got, want in zip(out.tolist(), expected.tolist()):
+    for got, want in zip(result["winsorize_col_0_"].tolist(), expected.tolist()):
         if math.isnan(want):
             assert math.isnan(got)
         else:
             assert got == want


 def test_winsorize_context_override():
-    engine = python_engine.Engine()
-    hopsworks_engine.set_instance(engine=engine, engine_type="python")
+    # Arrange
+    df = pd.DataFrame(
+        {
+            "col_0": [2.0, 5.0, 95.0, 96.0, math.nan],
+            "other": ["a", "b", "c", "d", "e"],
+        }
+    )

-    percentiles = list(range(100))
-    _set_percentiles(percentiles)
+    tf = transformation_function.TransformationFunction(
+        hopsworks_udf=winsorize("col_0"),
+        featurestore_id=1,
+        transformation_type=TransformationType.MODEL_DEPENDENT,
+    )

-    s = pd.Series([2.0, 5.0, 95.0, 96.0, math.nan])
+    # Percentiles from 0th to 100th (101 values)
+    percentiles = [float(i) for i in range(101)]
+    tf.transformation_statistics = [
+        FeatureDescriptiveStatistics(feature_name="col_0", percentiles=percentiles)
+    ]

-    winsorize.transformation_context = {"p_low": 5, "p_high": 95}
+    engine = python_engine.Engine()
+    hopsworks_engine.set_instance(engine=engine, engine_type="python")

-    out = _run_raw(s)
+    # Act - Override percentile thresholds via context parameter
+    result = engine._apply_transformation_function(
+        [tf], df, transformation_context={"p_low": 5, "p_high": 95}
+    )

+    # Assert - clips at 5th and 95th percentiles (values 5 and 95)
+    assert list(result.columns) == ["other", "winsorize_col_0_"]
     expected = pd.Series([5.0, 5.0, 95.0, 95.0, math.nan])
+    expected.name = "winsorize_col_0_"

-    for got, want in zip(out.tolist(), expected.tolist()):
+    for got, want in zip(result["winsorize_col_0_"].tolist(), expected.tolist()):
         if math.isnan(want):
             assert math.isnan(got)
         else:
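Independent of the hsfs engine wiring, a small sketch of the arithmetic the two tests above assert: with the 101-value percentile list [0.0, ..., 100.0], the default thresholds pick percentiles[1] and percentiles[99], while a context of {"p_low": 5, "p_high": 95} picks percentiles[5] and percentiles[95]. The helper below hard-wires the statistics and mirrors only the clipping step of the builtin UDF; its name is illustrative.

import math
import pandas as pd

percentiles = [float(i) for i in range(101)]  # same statistics as in the tests

def winsorize_sketch(s: pd.Series, p_low: int = 1, p_high: int = 99) -> pd.Series:
    # Clip at the chosen percentile values; NaN passes through unchanged.
    return s.astype("float64").clip(lower=percentiles[p_low], upper=percentiles[p_high])

s = pd.Series([2.0, 5.0, 95.0, 96.0, math.nan])
print(winsorize_sketch(s).tolist())                      # [2.0, 5.0, 95.0, 96.0, nan]
print(winsorize_sketch(s, p_low=5, p_high=95).tolist())  # [5.0, 5.0, 95.0, 95.0, nan]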
