22
33import pandas as pd
44from hsfs import engine as hopsworks_engine
5+ from hsfs import transformation_function
56from hsfs .builtin_transformations import winsorize
67from hsfs .core .feature_descriptive_statistics import FeatureDescriptiveStatistics
78from hsfs .engine import python as python_engine
8- from python .hsfs import transformation_function
9- from python .hsfs .transformation_function import TransformationType
10-
11-
12- def _set_percentiles (pcts ):
13- winsorize .transformation_statistics = [
14- FeatureDescriptiveStatistics (feature_name = "feature" , percentiles = pcts )
15- ]
16- winsorize .transformation_context = None # reset each test
17-
18-
19- def _run_raw (series : pd .Series ) -> pd .Series :
20- """Call the raw UDF function directly so input stays a Series."""
21- fn = winsorize .get_udf (online = True )
22-
23- return fn (series )
9+ from hsfs .transformation_function import TransformationType
2410
2511
2612def test_winsorize_default_thresholds ():
27- engine = python_engine . Engine ()
28- hopsworks_engine . set_instance ( engine = engine , engine_type = "python" )
29-
30- percentiles = list ( range ( 100 )) # [0..99]
31- _set_percentiles ( percentiles )
32-
33- s = pd . Series ([ 0.0 , 1.0 , 50.0 , 99.0 , 120.0 , math . nan ] )
13+ # Arrange
14+ df = pd . DataFrame (
15+ {
16+ "col_0" : [0.0 , 1.0 , 50.0 , 99.0 , 120.0 , math . nan ],
17+ "other" : [ "a" , "b" , "c" , "d" , "e" , "f" ],
18+ }
19+ )
3420
3521 tf = transformation_function .TransformationFunction (
3622 hopsworks_udf = winsorize ("col_0" ),
3723 featurestore_id = 1 ,
3824 transformation_type = TransformationType .MODEL_DEPENDENT ,
3925 )
4026
41- percentiles = [float (i ) for i in range (100 )]
27+ # Percentiles from 0th to 100th (101 values)
28+ percentiles = [float (i ) for i in range (101 )]
4229 tf .transformation_statistics = [
4330 FeatureDescriptiveStatistics (feature_name = "col_0" , percentiles = percentiles )
4431 ]
@@ -47,35 +34,55 @@ def test_winsorize_default_thresholds():
4734 hopsworks_engine .set_instance (engine = engine , engine_type = "python" )
4835
4936 # Act
50- out = engine ._apply_transformation_function ([tf ], s )
51-
52- out = _run_raw (s )
37+ result = engine ._apply_transformation_function ([tf ], df )
5338
39+ # Assert - defaults clip at 1st and 99th percentiles (values 1 and 99)
40+ assert list (result .columns ) == ["other" , "winsorize_col_0_" ]
5441 expected = pd .Series ([1.0 , 1.0 , 50.0 , 99.0 , 99.0 , math .nan ])
42+ expected .name = "winsorize_col_0_"
5543
56- for got , want in zip (out .tolist (), expected .tolist ()):
44+ for got , want in zip (result [ "winsorize_col_0_" ] .tolist (), expected .tolist ()):
5745 if math .isnan (want ):
5846 assert math .isnan (got )
5947 else :
6048 assert got == want
6149
6250
6351def test_winsorize_context_override ():
64- engine = python_engine .Engine ()
65- hopsworks_engine .set_instance (engine = engine , engine_type = "python" )
52+ # Arrange
53+ df = pd .DataFrame (
54+ {
55+ "col_0" : [2.0 , 5.0 , 95.0 , 96.0 , math .nan ],
56+ "other" : ["a" , "b" , "c" , "d" , "e" ],
57+ }
58+ )
6659
67- percentiles = list (range (100 ))
68- _set_percentiles (percentiles )
60+ tf = transformation_function .TransformationFunction (
61+ hopsworks_udf = winsorize ("col_0" ),
62+ featurestore_id = 1 ,
63+ transformation_type = TransformationType .MODEL_DEPENDENT ,
64+ )
6965
70- s = pd .Series ([2.0 , 5.0 , 95.0 , 96.0 , math .nan ])
66+ # Percentiles from 0th to 100th (101 values)
67+ percentiles = [float (i ) for i in range (101 )]
68+ tf .transformation_statistics = [
69+ FeatureDescriptiveStatistics (feature_name = "col_0" , percentiles = percentiles )
70+ ]
7171
72- winsorize .transformation_context = {"p_low" : 5 , "p_high" : 95 }
72+ engine = python_engine .Engine ()
73+ hopsworks_engine .set_instance (engine = engine , engine_type = "python" )
7374
74- out = _run_raw (s )
75+ # Act - Override percentile thresholds via context parameter
76+ result = engine ._apply_transformation_function (
77+ [tf ], df , transformation_context = {"p_low" : 5 , "p_high" : 95 }
78+ )
7579
80+ # Assert - clips at 5th and 95th percentiles (values 5 and 95)
81+ assert list (result .columns ) == ["other" , "winsorize_col_0_" ]
7682 expected = pd .Series ([5.0 , 5.0 , 95.0 , 95.0 , math .nan ])
83+ expected .name = "winsorize_col_0_"
7784
78- for got , want in zip (out .tolist (), expected .tolist ()):
85+ for got , want in zip (result [ "winsorize_col_0_" ] .tolist (), expected .tolist ()):
7986 if math .isnan (want ):
8087 assert math .isnan (got )
8188 else :
0 commit comments