@@ -378,67 +378,67 @@ def winsorize(
378378 return pd .Series (clipped , index = feature .index )
379379
380380
@udf(float, drop=["feature"], mode="pandas")
def target_mean_encoder(
    feature: pd.Series,
    label: pd.Series,
    statistics=feature_statistics,
    context: dict | None = None,
) -> pd.Series:
    """
    Encode a categorical feature as the per-category mean of the target.

    Each non-null value of `feature` is replaced by the mean of `label`
    observed for that category.

    Usage notes:
        - Training (offline): pass both `feature` and `label`. The
          category -> mean mapping and the global mean are computed from
          the two Series and take precedence over anything in `context`.
        - Serving (online) or whenever `label` has no usable numeric value:
          supply a precomputed mapping through `context` as
          `{"target_means": {category: mean, ...}, "global_mean": <float>}`
          (the key `"mapping"` is accepted as an alias of `"target_means"`).
        - Only the `feature` column is listed in `drop`; `label` is not.

    Args:
        feature: Categorical values to encode.
        label: Target values. Non-numeric entries are coerced to NaN. May be
            None or entirely null/non-numeric, in which case `context` is used.
        statistics: Not read by this function.
            NOTE(review): presumably required by the `@udf` framework
            signature -- confirm before removing.
        context: Optional dict carrying the precomputed mapping and global
            mean described above.

    Returns:
        A float64 Series aligned to `feature.index`:
            - NaN where `feature` is null;
            - the category mean where the category is in the mapping;
            - `global_mean` for categories missing from the mapping, or NaN
              when no global mean is available.

    NOTE(review): on the training path each row's own label contributes to
    its category mean; consider out-of-fold or smoothed encoding if target
    leakage matters for the downstream model.
    """
    mapping: dict | None = None
    global_mean: float | None = None

    if isinstance(context, dict):
        mapping = context.get("target_means") or context.get("mapping")
        global_mean = context.get("global_mean")

    # Training/offline path. Usability is decided AFTER numeric coercion so a
    # label column that is present but entirely null or non-numeric does not
    # overwrite a valid context mapping with all-NaN means.
    if label is not None:
        y_num = pd.to_numeric(label, errors="coerce")
        if pd.notna(y_num).any():
            pairs = pd.DataFrame({"__cat__": feature, "__y__": y_num})
            mapping = pairs.groupby("__cat__")["__y__"].mean().to_dict()
            overall = y_num.mean()
            global_mean = None if pd.isna(overall) else float(overall)

    # With no mapping at all, every non-null category counts as "unseen" and
    # therefore receives the fallback value.
    if mapping is None:
        mapping = {}

    # Use NaN (never None) as the fallback so the result stays numeric.
    fallback = np.nan if global_mean is None else global_mean

    def _encode(value):
        if pd.isna(value):
            return np.nan
        return mapping.get(value, fallback)

    encoded = feature.map(_encode)
    # Guarantee the float dtype declared in the @udf decorator; anything
    # non-numeric coming from a context mapping (e.g. None) becomes NaN.
    return pd.to_numeric(encoded, errors="coerce").astype("float64")
381+ # @udf(float, drop=["feature"], mode="pandas")
382+ # def target_mean_encoder(
383+ # feature: pd.Series,
384+ # label: pd.Series,
385+ # statistics=feature_statistics,
386+ # context: dict | None = None,
387+ # ) -> pd.Series:
388+ # """
389+ # Target / Mean Encoding for categorical features.
390+
391+ # Replaces each category in `feature` with the mean of the target variable `label`.
392+
393+ # Usage notes:
394+ # - During training (offline): provide both `feature` and `label`; the encoder computes
395+ # the per-category mean on-the-fly from these two Series.
396+ # - During serving/online or when labels are unavailable: provide a precomputed mapping via
397+ # the transformation context as `{"target_means": {category: mean, ...}, "global_mean": <float>}`.
398+ # Unseen categories fall back to `global_mean` when provided, otherwise NaN.
399+ # - Only the input `feature` column is dropped. The `label` column is preserved in outputs.
400+
401+ # Edge cases:
402+ # - If `label` is entirely null or not provided (e.g., serving), a context mapping is required.
403+ # - If `feature` contains NaN, the encoded value will be NaN for those rows.
404+ # """
405+
406+ # # Ensure pandas Series with appropriate dtype
407+ # f = feature
408+ # y = label if label is not None else None
409+
410+ # mapping: dict | None = None
411+ # global_mean: float | None = None
412+
413+ # if isinstance(context, dict):
414+ # mapping = context.get("target_means") or context.get("mapping")
415+ # global_mean = context.get("global_mean")
416+
417+ # # Training/offline path: compute mapping from data if label provided and non-empty
418+ # if y is not None and not (isinstance(y, pd.Series) and y.isna().all()):
419+ # # Attempt numeric conversion for label; errors='coerce' will turn non-numeric into NaN
420+ # y_num = pd.to_numeric(y, errors="coerce")
421+ # # Compute category -> mean(label)
422+ # df = pd.DataFrame({"__cat__": f, "__y__": y_num})
423+ # means = df.groupby("__cat__")["__y__"].mean()
424+ # mapping = means.to_dict()
425+ # # Global mean for fallback on unseen categories at serve-time
426+ # global_mean = float(y_num.mean()) if not pd.isna(y_num.mean()) else None
427+
428+ # if mapping is None:
429+ # # No mapping available: try to use just global mean for all known categories
430+ # if global_mean is not None:
431+ # return pd.Series(
432+ # [global_mean if pd.notna(v) else np.nan for v in f], index=f.index
433+ # )
434+ # # As a last resort, return NaNs (cannot encode)
435+ # return pd.Series([np.nan for _ in f], index=f.index)
436+
437+ # # Map categories to target means; unseen -> global_mean (if provided) else NaN
438+ # def _map_val(v):
439+ # if pd.isna(v):
440+ # return np.nan
441+ # return mapping.get(v, global_mean)
442+
443+ # encoded = f.map(_map_val)
444+ # return pd.Series(encoded, index=f.index)
0 commit comments