Skip to content

Commit 1260fc6

Browse files
ENH: Add preprocessing module with scaling functions (#35)
* ENH: Initialize preprocessing module
* ENH: Add minmax_scale and standardize functions
* ENH: Add apply_scaling function to select scaler by name
* TST: Add unit tests for scalers
1 parent dd665bf commit 1260fc6

File tree

4 files changed

+401
-0
lines changed

4 files changed

+401
-0
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
"""Preprocessing module."""
2+
3+
from .scalers import apply_scaling, minmax_scale, standardize
4+
5+
# Public API of the package: the three scaling helpers imported from .scalers.
__all__ = ["apply_scaling", "minmax_scale", "standardize"]
Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
"""Data scaling functions."""
2+
3+
from scipy import sparse
4+
from sklearn.preprocessing import MinMaxScaler, StandardScaler
5+
from sklearn.utils.validation import check_array
6+
7+
8+
def _validate_and_align(X_train, X_test):
    """Check both matrices and confirm they share the same feature count.

    Each input is passed through ``check_array`` with sparse input allowed
    and a numeric dtype requested. The test matrix is optional; when it is
    given, its second dimension must equal that of the training matrix.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training feature matrix; its column count is the reference.

    X_test : array-like of shape (m_samples, n_features) or None
        Test feature matrix to compare against the training matrix.

    Returns
    -------
    (X_train_valid, X_test_valid) : tuple
        The validated inputs. ``X_test_valid`` is None when ``X_test`` is None.

    Raises
    ------
    ValueError
        If ``X_test`` does not have the same number of features as
        ``X_train``.

    """
    X_train = check_array(X_train, accept_sparse=True, dtype="numeric")

    # Nothing to align against when no test matrix was supplied.
    if X_test is None:
        return X_train, None

    X_test = check_array(X_test, accept_sparse=True, dtype="numeric")
    if X_test.shape[1] == X_train.shape[1]:
        return X_train, X_test

    raise ValueError(
        f"X_test has {X_test.shape[1]} features but X_train has {X_train.shape[1]}."
    )
38+
39+
40+
def apply_scaling(X_train, X_test=None, method=None, return_transformer=False):
    """Scale the data with the scaler selected by name.

    The selected scaler is fit on the training data and then applied to the
    training data and, when provided, the test data. The method name is
    matched case-insensitively.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Feature matrix used for model training.

    X_test : array-like of shape (m_samples, n_features), optional
        Feature matrix used for model evaluation and prediction.

    method : {"norm", "std", None}, optional
        - "norm": Min-Max scaling to [0, 1] (delegates to ``minmax_scale``)
        - "std" : Standardization (delegates to ``standardize``)
        - None  : No preprocessing; the inputs are returned unchanged

    return_transformer : bool, default=False
        If True, the fitted scaling object is returned as a third element.

    Returns
    -------
    (X_train_scaled, X_test_scaled) or (X_train_scaled, X_test_scaled, scaler)
        Scaled arrays; X_test_scaled is None if X_test is None.
        If return_transformer=True and method=None, scaler is None.

    Raises
    ------
    ValueError
        If ``method`` is neither a string nor None, or if it does not name
        a known scaling method.

    Examples
    --------
    >>> import numpy as np
    >>> from preprocessing.scalers import apply_scaling
    >>> X_train = np.array([[1.0, 2.0], [2.0, 4.0], [3.0, 6.0]])
    >>> X_test = np.array([[2.5, 5.0]])
    >>> X_train_scaled, X_test_scaled = apply_scaling(X_train, X_test, method="norm")
    >>> X_train_scaled
    array([[0. , 0. ],
           [0.5, 0.5],
           [1. , 1. ]])
    >>> X_test_scaled
    array([[0.75, 0.75]])
    >>> X_train_scaled, X_test_scaled = apply_scaling(X_train, X_test, method="std")
    >>> X_train_scaled.round(3)
    array([[-1.225, -1.225],
           [ 0.   ,  0.   ],
           [ 1.225,  1.225]])
    >>> X_test_scaled.round(3)
    array([[0.612, 0.612]])

    """
    # No scaling requested: hand the inputs back as they were received.
    if method is None:
        if return_transformer:
            return X_train, X_test, None
        return X_train, X_test

    if not isinstance(method, str):
        raise ValueError("Scaling method must be a string or None.")

    selected = method.lower()
    if selected == "std":
        return standardize(X_train, X_test, return_transformer)
    if selected == "norm":
        return minmax_scale(X_train, X_test, return_transformer)

    # The message reports the name exactly as the caller spelled it.
    raise ValueError(
        f"Unknown scaling method '{method}'. Valid options: 'norm', 'std', None."
    )
110+
111+
112+
def minmax_scale(X_train, X_test=None, return_transformer=False):
    """Rescale every feature to the [0, 1] range.

    A ``MinMaxScaler`` is fit on the training data only; the same fitted
    transformation is then applied to the training set and, when given,
    the test set.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training feature matrix used to fit the scaling parameters.

    X_test : array-like of shape (m_samples, n_features), optional
        Test feature matrix transformed with the fitted parameters.

    return_transformer : bool, default=False
        If True, the fitted scaling object is returned as a third element.

    Returns
    -------
    (X_train_scaled, X_test_scaled) or (X_train_scaled, X_test_scaled, scaler)
        Scaled arrays; X_test_scaled is None if X_test is None.

    Raises
    ------
    ValueError
        If X_test has a different number of features than X_train.

    Examples
    --------
    >>> import numpy as np
    >>> from preprocessing.scalers import minmax_scale
    >>> X_train = np.array([[1.0, 2.0], [2.0, 4.0], [3.0, 6.0]])
    >>> X_test = np.array([[2.5, 5.0]])
    >>> X_train_scaled, X_test_scaled = minmax_scale(X_train, X_test)
    >>> X_train_scaled
    array([[0. , 0. ],
           [0.5, 0.5],
           [1. , 1. ]])
    >>> X_test_scaled
    array([[0.75, 0.75]])

    """
    # NOTE(review): the validation helper accepts sparse input, but sklearn's
    # MinMaxScaler is documented as dense-only -- confirm how sparse matrices
    # are meant to be handled here.
    train_valid, test_valid = _validate_and_align(X_train, X_test)

    scaler = MinMaxScaler(feature_range=(0.0, 1.0))
    train_scaled = scaler.fit_transform(train_valid)

    test_scaled = None
    if test_valid is not None:
        test_scaled = scaler.transform(test_valid)

    if not return_transformer:
        return train_scaled, test_scaled
    return train_scaled, test_scaled, scaler
164+
165+
166+
def standardize(X_train, X_test=None, return_transformer=False):
    """Standardize features to have zero mean and unit variance.

    Fits scaling parameters on training data and applies the same
    transformation to both training and test sets. When either ``X_train``
    or ``X_test`` is a sparse matrix, centering is disabled (features are
    only divided by their standard deviation) to preserve sparsity.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training feature matrix used to fit scaling parameters.

    X_test : array-like of shape (m_samples, n_features), optional
        Test feature matrix to transform using fitted parameters.

    return_transformer : bool, default=False
        If True, also return the fitted scaling object.

    Returns
    -------
    (X_train_scaled, X_test_scaled) or (X_train_scaled, X_test_scaled, scaler)
        Scaled arrays; X_test_scaled is None if X_test is None.

    Raises
    ------
    ValueError
        If X_test has a different number of features than X_train.

    Examples
    --------
    >>> import numpy as np
    >>> from preprocessing.scalers import standardize
    >>> X_train = np.array([[1.0, 2.0], [2.0, 4.0], [3.0, 6.0]])
    >>> X_test = np.array([[2.5, 5.0]])
    >>> X_train_scaled, X_test_scaled = standardize(X_train, X_test)
    >>> X_train_scaled.round(3)
    array([[-1.225, -1.225],
           [ 0.   ,  0.   ],
           [ 1.225,  1.225]])
    >>> X_test_scaled.round(3)
    array([[0.612, 0.612]])

    """
    X_train, X_test = _validate_and_align(X_train, X_test)

    # A centering StandardScaler rejects sparse input in transform() as well
    # as in fit(), so the decision must consider the test matrix too: a dense
    # X_train with a sparse X_test would otherwise fit fine and then fail on
    # transform. issparse(None) is False, so a missing X_test is harmless.
    any_sparse = sparse.issparse(X_train) or sparse.issparse(X_test)
    scaler = StandardScaler(with_mean=not any_sparse)

    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test) if X_test is not None else None

    if return_transformer:
        return X_train_scaled, X_test_scaled, scaler
    return X_train_scaled, X_test_scaled
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
"""Tests for the preprocessing module."""
2+
3+
# Nothing is exported from the tests package on a star-import.
__all__ = []

0 commit comments

Comments
 (0)