Skip to content

Commit 1f50c23

Browse files
committed
cleanup
1 parent 43c6286 commit 1f50c23

File tree

4 files changed

+68
-28
lines changed

4 files changed

+68
-28
lines changed

python/hsfs/builtin_transformations.py

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -325,56 +325,63 @@ def rank_normalizer(feature: pd.Series, statistics=feature_statistics) -> pd.Ser
325325
return pd.Series(result, index=feature.index)
326326

327327

328-
@udf(float, mode="pandas")
def winsorize(
    feature: pd.Series, statistics=feature_statistics, context: dict | None = None
) -> pd.Series:
    """
    Winsorization (clipping) to limit extreme values and reduce outlier influence.

    Row-size preserving: values outside the chosen percentile bounds are
    replaced with the boundary value instead of the row being removed.
    Missing values (NaN) are left untouched.

    Defaults to the [1st, 99th] percentiles unless overridden via context,
    e.g. ``{"p_low": 5, "p_high": 95}``.

    Args:
        feature: Values to winsorize; cast to float64 before clipping.
        statistics: Injected training statistics. ``statistics.feature.percentiles``
            is indexed by percentile, i.e. ``percentiles[i]`` is used as the
            i-th percentile.
        context: Optional dict with ``p_low`` / ``p_high`` percentile values.
            Anything that is not a dict is ignored.

    Returns:
        A float64 Series with the same index as ``feature``. The input is
        returned unclipped when no usable bounds can be derived (fewer than
        two percentiles, NaN bounds, or lower bound above upper bound).
    """
    numerical_feature = feature.astype("float64")
    percentiles = statistics.feature.percentiles

    # Two distinct bounds are needed; without them there is nothing to clip to.
    if percentiles is None or len(percentiles) < 2:
        return numerical_feature

    # Defaults: 1st and 99th percentiles, optionally overridden by the context.
    p_low = 1
    p_high = 99
    if isinstance(context, dict):
        p_low = context.get("p_low", p_low)
        p_high = context.get("p_high", p_high)

    # percentiles[i] is the i-th percentile, so the rounded percentile value
    # doubles as the array index. Non-numeric / non-finite overrides fall back
    # to the defaults.
    try:
        li = int(round(float(p_low)))
        ui = int(round(float(p_high)))
    except (TypeError, ValueError, OverflowError):
        li, ui = 1, 99

    # Bound both indices to [0, len(percentiles) - 1].
    max_idx = len(percentiles) - 1
    li = max(0, min(max_idx, li))
    ui = max(0, min(max_idx, ui))

    # Ensure li < ui; otherwise fall back to the defaults, kept inside the
    # array so short percentile lists cannot raise IndexError.
    if li >= ui:
        li, ui = min(1, max_idx - 1), min(99, max_idx)

    lower = percentiles[li]
    upper = percentiles[ui]

    # Invalid bounds -> return the values unchanged.
    if pd.isna(lower) or pd.isna(upper) or lower > upper:
        return numerical_feature

    # clip() keeps NaN as NaN. A chained where(x >= lower, ...).where(x <= upper, ...)
    # would not: both comparisons are False for NaN, so NaN would become `upper`.
    clipped = numerical_feature.clip(lower=lower, upper=upper)
    return pd.Series(clipped, index=feature.index)
379386

380387

python/tests/test_builtin_log_transform.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import math
1818

1919
import pandas as pd
20+
from hsfs import engine as hopsworks_engine
2021
from hsfs import transformation_function
2122
from hsfs.engine import python as python_engine
2223
from hsfs.transformation_function import TransformationType
@@ -40,6 +41,7 @@ def test_log_transform_python_engine():
4041
)
4142

4243
engine = python_engine.Engine()
44+
hopsworks_engine.set_instance(engine=engine, engine_type="python")
4345

4446
# Act
4547
result = engine._apply_transformation_function([tf], df)
@@ -50,6 +52,7 @@ def test_log_transform_python_engine():
5052
"log_transform_col_0_",
5153
]
5254
expected = pd.Series([0.0, math.log(2.0), math.nan, math.nan, math.nan])
55+
expected.name = "log_transform_col_0_"
5356
pd.testing.assert_series_equal(
5457
result["log_transform_col_0_"], expected, check_names=True, check_dtype=False
5558
)

python/tests/test_builtin_quantile_rank.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
import pandas as pd
1818
import pytest
19+
from hsfs import engine as hopsworks_engine
1920
from hsfs import transformation_function
2021
from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics
2122
from hsfs.engine import python as python_engine
@@ -47,6 +48,7 @@ def test_quantile_transformer():
4748
]
4849

4950
engine = python_engine.Engine()
51+
hopsworks_engine.set_instance(engine=engine, engine_type="python")
5052

5153
# Act
5254
result = engine._apply_transformation_function([tf], df)
@@ -85,6 +87,7 @@ def test_rank_normalizer():
8587
]
8688

8789
engine = python_engine.Engine()
90+
hopsworks_engine.set_instance(engine=engine, engine_type="python")
8891

8992
# Act
9093
result = engine._apply_transformation_function([tf], df)
@@ -123,6 +126,7 @@ def test_quantile_transformer_handles_nan():
123126
]
124127

125128
engine = python_engine.Engine()
129+
hopsworks_engine.set_instance(engine=engine, engine_type="python")
126130

127131
# Act
128132
result = engine._apply_transformation_function([tf], df)

python/tests/test_builtin_winsorize.py

Lines changed: 38 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,58 @@
11
import math
22

33
import pandas as pd
4+
from hsfs import engine as hopsworks_engine
45
from hsfs.builtin_transformations import winsorize
56
from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics
7+
from hsfs.engine import python as python_engine
8+
from python.hsfs import transformation_function
9+
from python.hsfs.transformation_function import TransformationType
610

711

812
def _set_percentiles(pcts):
    """Install `pcts` as the percentile statistics on the shared `winsorize` UDF.

    Also clears the UDF's transformation context so that a threshold override
    set by one test is reset before the next one runs.
    """
    feature_stats = FeatureDescriptiveStatistics(
        feature_name="feature", percentiles=pcts
    )
    winsorize.transformation_statistics = [feature_stats]
    winsorize.transformation_context = None
17+
18+
19+
def _run_raw(series: pd.Series) -> pd.Series:
    """Apply the online `winsorize` callable straight to `series`.

    The callable is fetched with `get_udf(online=True)` and invoked directly,
    so the input is passed through as a Series.
    """
    return winsorize.get_udf(online=True)(series)
1324

1425

1526
def test_winsorize_default_thresholds():
16-
# Build a simple monotonic percentile array so index == value
17-
percentiles = list(range(100)) # 0..99
27+
engine = python_engine.Engine()
28+
hopsworks_engine.set_instance(engine=engine, engine_type="python")
29+
30+
percentiles = list(range(100)) # [0..99]
1831
_set_percentiles(percentiles)
1932

20-
# Input data within and outside bounds
2133
s = pd.Series([0.0, 1.0, 50.0, 99.0, 120.0, math.nan])
2234

23-
# Get callable for Python execution
24-
fn = winsorize.get_udf(online=True)
25-
out = fn(s)
35+
tf = transformation_function.TransformationFunction(
36+
hopsworks_udf=winsorize("col_0"),
37+
featurestore_id=1,
38+
transformation_type=TransformationType.MODEL_DEPENDENT,
39+
)
40+
41+
percentiles = [float(i) for i in range(100)]
42+
tf.transformation_statistics = [
43+
FeatureDescriptiveStatistics(feature_name="col_0", percentiles=percentiles)
44+
]
45+
46+
engine = python_engine.Engine()
47+
hopsworks_engine.set_instance(engine=engine, engine_type="python")
48+
49+
# Act
50+
out = engine._apply_transformation_function([tf], s)
51+
52+
out = _run_raw(s)
2653

27-
# Default thresholds: [1st, 99th] => [1.0, 99.0]
2854
expected = pd.Series([1.0, 1.0, 50.0, 99.0, 99.0, math.nan])
2955

30-
# Compare element-wise allowing NaNs
3156
for got, want in zip(out.tolist(), expected.tolist()):
3257
if math.isnan(want):
3358
assert math.isnan(got)
@@ -36,16 +61,17 @@ def test_winsorize_default_thresholds():
3661

3762

3863
def test_winsorize_context_override():
64+
engine = python_engine.Engine()
65+
hopsworks_engine.set_instance(engine=engine, engine_type="python")
66+
3967
percentiles = list(range(100))
4068
_set_percentiles(percentiles)
4169

4270
s = pd.Series([2.0, 5.0, 95.0, 96.0, math.nan])
4371

44-
# Override to [5th, 95th]
4572
winsorize.transformation_context = {"p_low": 5, "p_high": 95}
4673

47-
fn = winsorize.get_udf(online=True)
48-
out = fn(s)
74+
out = _run_raw(s)
4975

5076
expected = pd.Series([5.0, 5.0, 95.0, 95.0, math.nan])
5177

0 commit comments

Comments
 (0)