# Authors: The scikit-learn developers # SPDX-License-Identifier: BSD-3-Clause import re import warnings import numpy as np import numpy.linalg as la import pytest from scipy import sparse, stats from sklearn import config_context, datasets from sklearn.base import clone from sklearn.exceptions import NotFittedError from sklearn.externals._packaging.version import parse as parse_version from sklearn.metrics.pairwise import linear_kernel from sklearn.model_selection import cross_val_predict from sklearn.pipeline import Pipeline from sklearn.preprocessing import ( Binarizer, KernelCenterer, MaxAbsScaler, MinMaxScaler, Normalizer, PowerTransformer, QuantileTransformer, RobustScaler, StandardScaler, add_dummy_feature, maxabs_scale, minmax_scale, normalize, power_transform, quantile_transform, robust_scale, scale, ) from sklearn.preprocessing._data import BOUNDS_THRESHOLD, _handle_zeros_in_scale from sklearn.svm import SVR from sklearn.utils import gen_batches, shuffle from sklearn.utils._array_api import ( _convert_to_numpy, _get_namespace_device_dtype_ids, yield_namespace_device_dtype_combinations, ) from sklearn.utils._test_common.instance_generator import _get_check_estimator_ids from sklearn.utils._testing import ( _array_api_for_tests, _convert_container, assert_allclose, assert_allclose_dense_sparse, assert_almost_equal, assert_array_almost_equal, assert_array_equal, assert_array_less, skip_if_32bit, ) from sklearn.utils.estimator_checks import ( check_array_api_input_and_values, ) from sklearn.utils.fixes import ( COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS, LIL_CONTAINERS, sp_version, ) from sklearn.utils.sparsefuncs import mean_variance_axis iris = datasets.load_iris() # Make some data to be used many times rng = np.random.RandomState(0) n_features = 30 n_samples = 1000 offsets = rng.uniform(-1, 1, size=n_features) scales = rng.uniform(1, 10, size=n_features) X_2d = rng.randn(n_samples, n_features) * scales + offsets X_1row = X_2d[0, :].reshape(1, n_features) X_1col = X_2d[:, 0].reshape(n_samples, 1) X_list_1row = X_1row.tolist() X_list_1col = X_1col.tolist() def toarray(a): if hasattr(a, "toarray"): a = a.toarray() return a def _check_dim_1axis(a): return np.asarray(a).shape[0] def assert_correct_incr(i, batch_start, batch_stop, n, chunk_size, n_samples_seen): if batch_stop != n: assert (i + 1) * chunk_size == n_samples_seen else: assert i * chunk_size + (batch_stop - batch_start) == n_samples_seen def test_raises_value_error_if_sample_weights_greater_than_1d(): # Sample weights must be either scalar or 1D n_sampless = [2, 3] n_featuress = [3, 2] for n_samples, n_features in zip(n_sampless, n_featuress): X = rng.randn(n_samples, n_features) y = rng.randn(n_samples) scaler = StandardScaler() # make sure Error is raised the sample weights greater than 1d sample_weight_notOK = rng.randn(n_samples, 1) ** 2 with pytest.raises(ValueError): scaler.fit(X, y, sample_weight=sample_weight_notOK) @pytest.mark.parametrize( ["Xw", "X", "sample_weight"], [ ([[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [1, 2, 3], [4, 5, 6]], [2.0, 1.0]), ( [[1, 0, 1], [0, 0, 1]], [[1, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1]], np.array([1, 3]), ), ( [[1, np.nan, 1], [np.nan, np.nan, 1]], [ [1, np.nan, 1], [np.nan, np.nan, 1], [np.nan, np.nan, 1], [np.nan, np.nan, 1], ], np.array([1, 3]), ), ], ) @pytest.mark.parametrize("array_constructor", ["array", "sparse_csr", "sparse_csc"]) def test_standard_scaler_sample_weight(Xw, X, sample_weight, array_constructor): with_mean = not array_constructor.startswith("sparse") X = _convert_container(X, array_constructor) Xw = _convert_container(Xw, array_constructor) # weighted StandardScaler yw = np.ones(Xw.shape[0]) scaler_w = StandardScaler(with_mean=with_mean) scaler_w.fit(Xw, yw, sample_weight=sample_weight) # unweighted, but with repeated samples y = np.ones(X.shape[0]) scaler = StandardScaler(with_mean=with_mean) scaler.fit(X, y) X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]] assert_almost_equal(scaler.mean_, scaler_w.mean_) assert_almost_equal(scaler.var_, scaler_w.var_) assert_almost_equal(scaler.transform(X_test), scaler_w.transform(X_test)) def test_standard_scaler_1d(): # Test scaling of dataset along single axis for X in [X_1row, X_1col, X_list_1row, X_list_1row]: scaler = StandardScaler() X_scaled = scaler.fit(X).transform(X, copy=True) if isinstance(X, list): X = np.array(X) # cast only after scaling done if _check_dim_1axis(X) == 1: assert_almost_equal(scaler.mean_, X.ravel()) assert_almost_equal(scaler.scale_, np.ones(n_features)) assert_array_almost_equal(X_scaled.mean(axis=0), np.zeros_like(n_features)) assert_array_almost_equal(X_scaled.std(axis=0), np.zeros_like(n_features)) else: assert_almost_equal(scaler.mean_, X.mean()) assert_almost_equal(scaler.scale_, X.std()) assert_array_almost_equal(X_scaled.mean(axis=0), np.zeros_like(n_features)) assert_array_almost_equal(X_scaled.mean(axis=0), 0.0) assert_array_almost_equal(X_scaled.std(axis=0), 1.0) assert scaler.n_samples_seen_ == X.shape[0] # check inverse transform X_scaled_back = scaler.inverse_transform(X_scaled) assert_array_almost_equal(X_scaled_back, X) # Constant feature X = np.ones((5, 1)) scaler = StandardScaler() X_scaled = scaler.fit(X).transform(X, copy=True) assert_almost_equal(scaler.mean_, 1.0) assert_almost_equal(scaler.scale_, 1.0) assert_array_almost_equal(X_scaled.mean(axis=0), 0.0) assert_array_almost_equal(X_scaled.std(axis=0), 0.0) assert scaler.n_samples_seen_ == X.shape[0] @pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) @pytest.mark.parametrize("add_sample_weight", [False, True]) def test_standard_scaler_dtype(add_sample_weight, sparse_container): # Ensure scaling does not affect dtype rng = np.random.RandomState(0) n_samples = 10 n_features = 3 if add_sample_weight: sample_weight = np.ones(n_samples) else: sample_weight = None with_mean = True if sparse_container is not None: # scipy sparse containers do not support float16, see # https://github.com/scipy/scipy/issues/7408 for more details. supported_dtype = [np.float64, np.float32] else: supported_dtype = [np.float64, np.float32, np.float16] for dtype in supported_dtype: X = rng.randn(n_samples, n_features).astype(dtype) if sparse_container is not None: X = sparse_container(X) with_mean = False scaler = StandardScaler(with_mean=with_mean) X_scaled = scaler.fit(X, sample_weight=sample_weight).transform(X) assert X.dtype == X_scaled.dtype assert scaler.mean_.dtype == np.float64 assert scaler.scale_.dtype == np.float64 @pytest.mark.parametrize( "scaler", [ StandardScaler(with_mean=False), RobustScaler(with_centering=False), ], ) @pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) @pytest.mark.parametrize("add_sample_weight", [False, True]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("constant", [0, 1.0, 100.0]) def test_standard_scaler_constant_features( scaler, add_sample_weight, sparse_container, dtype, constant ): if isinstance(scaler, RobustScaler) and add_sample_weight: pytest.skip(f"{scaler.__class__.__name__} does not yet support sample_weight") rng = np.random.RandomState(0) n_samples = 100 n_features = 1 if add_sample_weight: fit_params = dict(sample_weight=rng.uniform(size=n_samples) * 2) else: fit_params = {} X_array = np.full(shape=(n_samples, n_features), fill_value=constant, dtype=dtype) X = X_array if sparse_container is None else sparse_container(X_array) X_scaled = scaler.fit(X, **fit_params).transform(X) if isinstance(scaler, StandardScaler): # The variance info should be close to zero for constant features. assert_allclose(scaler.var_, np.zeros(X.shape[1]), atol=1e-7) # Constant features should not be scaled (scale of 1.): assert_allclose(scaler.scale_, np.ones(X.shape[1])) assert X_scaled is not X # make sure we make a copy assert_allclose_dense_sparse(X_scaled, X) if isinstance(scaler, StandardScaler) and not add_sample_weight: # Also check consistency with the standard scale function. X_scaled_2 = scale(X, with_mean=scaler.with_mean) assert X_scaled_2 is not X # make sure we did a copy assert_allclose_dense_sparse(X_scaled_2, X) @pytest.mark.parametrize("n_samples", [10, 100, 10_000]) @pytest.mark.parametrize("average", [1e-10, 1, 1e10]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) def test_standard_scaler_near_constant_features( n_samples, sparse_container, average, dtype ): # Check that when the variance is too small (var << mean**2) the feature # is considered constant and not scaled. scale_min, scale_max = -30, 19 scales = np.array([10**i for i in range(scale_min, scale_max + 1)], dtype=dtype) n_features = scales.shape[0] X = np.empty((n_samples, n_features), dtype=dtype) # Make a dataset of known var = scales**2 and mean = average X[: n_samples // 2, :] = average + scales X[n_samples // 2 :, :] = average - scales X_array = X if sparse_container is None else sparse_container(X) scaler = StandardScaler(with_mean=False).fit(X_array) # StandardScaler uses float64 accumulators even if the data has a float32 # dtype. eps = np.finfo(np.float64).eps # if var < bound = N.eps.var + N².eps².mean², the feature is considered # constant and the scale_ attribute is set to 1. bounds = n_samples * eps * scales**2 + n_samples**2 * eps**2 * average**2 within_bounds = scales**2 <= bounds # Check that scale_min is small enough to have some scales below the # bound and therefore detected as constant: assert np.any(within_bounds) # Check that such features are actually treated as constant by the scaler: assert all(scaler.var_[within_bounds] <= bounds[within_bounds]) assert_allclose(scaler.scale_[within_bounds], 1.0) # Depending the on the dtype of X, some features might not actually be # representable as non constant for small scales (even if above the # precision bound of the float64 variance estimate). Such feature should # be correctly detected as constants with 0 variance by StandardScaler. representable_diff = X[0, :] - X[-1, :] != 0 assert_allclose(scaler.var_[np.logical_not(representable_diff)], 0) assert_allclose(scaler.scale_[np.logical_not(representable_diff)], 1) # The other features are scaled and scale_ is equal to sqrt(var_) assuming # that scales are large enough for average + scale and average - scale to # be distinct in X (depending on X's dtype). common_mask = np.logical_and(scales**2 > bounds, representable_diff) assert_allclose(scaler.scale_[common_mask], np.sqrt(scaler.var_)[common_mask]) def test_scale_1d(): # 1-d inputs X_list = [1.0, 3.0, 5.0, 0.0] X_arr = np.array(X_list) for X in [X_list, X_arr]: X_scaled = scale(X) assert_array_almost_equal(X_scaled.mean(), 0.0) assert_array_almost_equal(X_scaled.std(), 1.0) assert_array_equal(scale(X, with_mean=False, with_std=False), X) @skip_if_32bit def test_standard_scaler_numerical_stability(): # Test numerical stability of scaling # np.log(1e-5) is taken because of its floating point representation # was empirically found to cause numerical problems with np.mean & np.std. x = np.full(8, np.log(1e-5), dtype=np.float64) # This does not raise a warning as the number of samples is too low # to trigger the problem in recent numpy with warnings.catch_warnings(): warnings.simplefilter("error", UserWarning) scale(x) assert_array_almost_equal(scale(x), np.zeros(8)) # with 2 more samples, the std computation run into numerical issues: x = np.full(10, np.log(1e-5), dtype=np.float64) warning_message = "standard deviation of the data is probably very close to 0" with pytest.warns(UserWarning, match=warning_message): x_scaled = scale(x) assert_array_almost_equal(x_scaled, np.zeros(10)) x = np.full(10, 1e-100, dtype=np.float64) with warnings.catch_warnings(): warnings.simplefilter("error", UserWarning) x_small_scaled = scale(x) assert_array_almost_equal(x_small_scaled, np.zeros(10)) # Large values can cause (often recoverable) numerical stability issues: x_big = np.full(10, 1e100, dtype=np.float64) warning_message = "Dataset may contain too large values" with pytest.warns(UserWarning, match=warning_message): x_big_scaled = scale(x_big) assert_array_almost_equal(x_big_scaled, np.zeros(10)) assert_array_almost_equal(x_big_scaled, x_small_scaled) with pytest.warns(UserWarning, match=warning_message): x_big_centered = scale(x_big, with_std=False) assert_array_almost_equal(x_big_centered, np.zeros(10)) assert_array_almost_equal(x_big_centered, x_small_scaled) def test_scaler_2d_arrays(): # Test scaling of 2d array along first axis rng = np.random.RandomState(0) n_features = 5 n_samples = 4 X = rng.randn(n_samples, n_features) X[:, 0] = 0.0 # first feature is always of zero scaler = StandardScaler() X_scaled = scaler.fit(X).transform(X, copy=True) assert not np.any(np.isnan(X_scaled)) assert scaler.n_samples_seen_ == n_samples assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0]) assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) # Check that X has been copied assert X_scaled is not X # check inverse transform X_scaled_back = scaler.inverse_transform(X_scaled) assert X_scaled_back is not X assert X_scaled_back is not X_scaled assert_array_almost_equal(X_scaled_back, X) X_scaled = scale(X, axis=1, with_std=False) assert not np.any(np.isnan(X_scaled)) assert_array_almost_equal(X_scaled.mean(axis=1), n_samples * [0.0]) X_scaled = scale(X, axis=1, with_std=True) assert not np.any(np.isnan(X_scaled)) assert_array_almost_equal(X_scaled.mean(axis=1), n_samples * [0.0]) assert_array_almost_equal(X_scaled.std(axis=1), n_samples * [1.0]) # Check that the data hasn't been modified assert X_scaled is not X X_scaled = scaler.fit(X).transform(X, copy=False) assert not np.any(np.isnan(X_scaled)) assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0]) assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) # Check that X has not been copied assert X_scaled is X X = rng.randn(4, 5) X[:, 0] = 1.0 # first feature is a constant, non zero feature scaler = StandardScaler() X_scaled = scaler.fit(X).transform(X, copy=True) assert not np.any(np.isnan(X_scaled)) assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0]) assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) # Check that X has not been copied assert X_scaled is not X def test_scaler_float16_overflow(): # Test if the scaler will not overflow on float16 numpy arrays rng = np.random.RandomState(0) # float16 has a maximum of 65500.0. On the worst case 5 * 200000 is 100000 # which is enough to overflow the data type X = rng.uniform(5, 10, [200000, 1]).astype(np.float16) with np.errstate(over="raise"): scaler = StandardScaler().fit(X) X_scaled = scaler.transform(X) # Calculate the float64 equivalent to verify result X_scaled_f64 = StandardScaler().fit_transform(X.astype(np.float64)) # Overflow calculations may cause -inf, inf, or nan. Since there is no nan # input, all of the outputs should be finite. This may be redundant since a # FloatingPointError exception will be thrown on overflow above. assert np.all(np.isfinite(X_scaled)) # The normal distribution is very unlikely to go above 4. At 4.0-8.0 the # float16 precision is 2^-8 which is around 0.004. Thus only 2 decimals are # checked to account for precision differences. assert_array_almost_equal(X_scaled, X_scaled_f64, decimal=2) def test_handle_zeros_in_scale(): s1 = np.array([0, 1e-16, 1, 2, 3]) s2 = _handle_zeros_in_scale(s1, copy=True) assert_allclose(s1, np.array([0, 1e-16, 1, 2, 3])) assert_allclose(s2, np.array([1, 1, 1, 2, 3])) def test_minmax_scaler_partial_fit(): # Test if partial_fit run over many batches of size 1 and 50 # gives the same results as fit X = X_2d n = X.shape[0] for chunk_size in [1, 2, 50, n, n + 42]: # Test mean at the end of the process scaler_batch = MinMaxScaler().fit(X) scaler_incr = MinMaxScaler() for batch in gen_batches(n_samples, chunk_size): scaler_incr = scaler_incr.partial_fit(X[batch]) assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_) assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_) assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_) assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_) # Test std after 1 step batch0 = slice(0, chunk_size) scaler_batch = MinMaxScaler().fit(X[batch0]) scaler_incr = MinMaxScaler().partial_fit(X[batch0]) assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_) assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_) assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_) assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_) # Test std until the end of partial fits, and scaler_batch = MinMaxScaler().fit(X) scaler_incr = MinMaxScaler() # Clean estimator for i, batch in enumerate(gen_batches(n_samples, chunk_size)): scaler_incr = scaler_incr.partial_fit(X[batch]) assert_correct_incr( i, batch_start=batch.start, batch_stop=batch.stop, n=n, chunk_size=chunk_size, n_samples_seen=scaler_incr.n_samples_seen_, ) def test_standard_scaler_partial_fit(): # Test if partial_fit run over many batches of size 1 and 50 # gives the same results as fit X = X_2d n = X.shape[0] for chunk_size in [1, 2, 50, n, n + 42]: # Test mean at the end of the process scaler_batch = StandardScaler(with_std=False).fit(X) scaler_incr = StandardScaler(with_std=False) for batch in gen_batches(n_samples, chunk_size): scaler_incr = scaler_incr.partial_fit(X[batch]) assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_) assert scaler_batch.var_ == scaler_incr.var_ # Nones assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ # Test std after 1 step batch0 = slice(0, chunk_size) scaler_incr = StandardScaler().partial_fit(X[batch0]) if chunk_size == 1: assert_array_almost_equal( np.zeros(n_features, dtype=np.float64), scaler_incr.var_ ) assert_array_almost_equal( np.ones(n_features, dtype=np.float64), scaler_incr.scale_ ) else: assert_array_almost_equal(np.var(X[batch0], axis=0), scaler_incr.var_) assert_array_almost_equal( np.std(X[batch0], axis=0), scaler_incr.scale_ ) # no constants # Test std until the end of partial fits, and scaler_batch = StandardScaler().fit(X) scaler_incr = StandardScaler() # Clean estimator for i, batch in enumerate(gen_batches(n_samples, chunk_size)): scaler_incr = scaler_incr.partial_fit(X[batch]) assert_correct_incr( i, batch_start=batch.start, batch_stop=batch.stop, n=n, chunk_size=chunk_size, n_samples_seen=scaler_incr.n_samples_seen_, ) assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_) assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ @pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) def test_standard_scaler_partial_fit_numerical_stability(sparse_container): # Test if the incremental computation introduces significative errors # for large datasets with values of large magniture rng = np.random.RandomState(0) n_features = 2 n_samples = 100 offsets = rng.uniform(-1e15, 1e15, size=n_features) scales = rng.uniform(1e3, 1e6, size=n_features) X = rng.randn(n_samples, n_features) * scales + offsets scaler_batch = StandardScaler().fit(X) scaler_incr = StandardScaler() for chunk in X: scaler_incr = scaler_incr.partial_fit(chunk.reshape(1, n_features)) # Regardless of abs values, they must not be more diff 6 significant digits tol = 10 ** (-6) assert_allclose(scaler_incr.mean_, scaler_batch.mean_, rtol=tol) assert_allclose(scaler_incr.var_, scaler_batch.var_, rtol=tol) assert_allclose(scaler_incr.scale_, scaler_batch.scale_, rtol=tol) # NOTE Be aware that for much larger offsets std is very unstable (last # assert) while mean is OK. # Sparse input size = (100, 3) scale = 1e20 X = sparse_container(rng.randint(0, 2, size).astype(np.float64) * scale) # with_mean=False is required with sparse input scaler = StandardScaler(with_mean=False).fit(X) scaler_incr = StandardScaler(with_mean=False) for chunk in X: if chunk.ndim == 1: # Sparse arrays can be 1D (in scipy 1.14 and later) while old # sparse matrix instances are always 2D. chunk = chunk.reshape(1, -1) scaler_incr = scaler_incr.partial_fit(chunk) # Regardless of magnitude, they must not differ more than of 6 digits tol = 10 ** (-6) assert scaler.mean_ is not None assert_allclose(scaler_incr.var_, scaler.var_, rtol=tol) assert_allclose(scaler_incr.scale_, scaler.scale_, rtol=tol) @pytest.mark.parametrize("sample_weight", [True, None]) @pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) def test_partial_fit_sparse_input(sample_weight, sparse_container): # Check that sparsity is not destroyed X = sparse_container(np.array([[1.0], [0.0], [0.0], [5.0]])) if sample_weight: sample_weight = rng.rand(X.shape[0]) null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) X_null = null_transform.partial_fit(X, sample_weight=sample_weight).transform(X) assert_array_equal(X_null.toarray(), X.toarray()) X_orig = null_transform.inverse_transform(X_null) assert_array_equal(X_orig.toarray(), X_null.toarray()) assert_array_equal(X_orig.toarray(), X.toarray()) @pytest.mark.parametrize("sample_weight", [True, None]) def test_standard_scaler_trasform_with_partial_fit(sample_weight): # Check some postconditions after applying partial_fit and transform X = X_2d[:100, :] if sample_weight: sample_weight = rng.rand(X.shape[0]) scaler_incr = StandardScaler() for i, batch in enumerate(gen_batches(X.shape[0], 1)): X_sofar = X[: (i + 1), :] chunks_copy = X_sofar.copy() if sample_weight is None: scaled_batch = StandardScaler().fit_transform(X_sofar) scaler_incr = scaler_incr.partial_fit(X[batch]) else: scaled_batch = StandardScaler().fit_transform( X_sofar, sample_weight=sample_weight[: i + 1] ) scaler_incr = scaler_incr.partial_fit( X[batch], sample_weight=sample_weight[batch] ) scaled_incr = scaler_incr.transform(X_sofar) assert_array_almost_equal(scaled_batch, scaled_incr) assert_array_almost_equal(X_sofar, chunks_copy) # No change right_input = scaler_incr.inverse_transform(scaled_incr) assert_array_almost_equal(X_sofar, right_input) zero = np.zeros(X.shape[1]) epsilon = np.finfo(float).eps assert_array_less(zero, scaler_incr.var_ + epsilon) # as less or equal assert_array_less(zero, scaler_incr.scale_ + epsilon) if sample_weight is None: # (i+1) because the Scaler has been already fitted assert (i + 1) == scaler_incr.n_samples_seen_ else: assert np.sum(sample_weight[: i + 1]) == pytest.approx( scaler_incr.n_samples_seen_ ) def test_standard_check_array_of_inverse_transform(): # Check if StandardScaler inverse_transform is # converting the integer array to float x = np.array( [ [1, 1, 1, 0, 1, 0], [1, 1, 1, 0, 1, 0], [0, 8, 0, 1, 0, 0], [1, 4, 1, 1, 0, 0], [0, 1, 0, 0, 1, 0], [0, 4, 0, 1, 0, 1], ], dtype=np.int32, ) scaler = StandardScaler() scaler.fit(x) # The of inverse_transform should be converted # to a float array. # If not X *= self.scale_ will fail. scaler.inverse_transform(x) @pytest.mark.parametrize( "array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations(), ids=_get_namespace_device_dtype_ids, ) @pytest.mark.parametrize( "check", [check_array_api_input_and_values], ids=_get_check_estimator_ids, ) @pytest.mark.parametrize( "estimator", [ MaxAbsScaler(), MinMaxScaler(), MinMaxScaler(clip=True), KernelCenterer(), Normalizer(norm="l1"), Normalizer(norm="l2"), Normalizer(norm="max"), Binarizer(), ], ids=_get_check_estimator_ids, ) def test_preprocessing_array_api_compliance( estimator, check, array_namespace, device, dtype_name ): name = estimator.__class__.__name__ check(name, estimator, array_namespace, device=device, dtype_name=dtype_name) def test_min_max_scaler_iris(): X = iris.data scaler = MinMaxScaler() # default params X_trans = scaler.fit_transform(X) assert_array_almost_equal(X_trans.min(axis=0), 0) assert_array_almost_equal(X_trans.max(axis=0), 1) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # not default params: min=1, max=2 scaler = MinMaxScaler(feature_range=(1, 2)) X_trans = scaler.fit_transform(X) assert_array_almost_equal(X_trans.min(axis=0), 1) assert_array_almost_equal(X_trans.max(axis=0), 2) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # min=-.5, max=.6 scaler = MinMaxScaler(feature_range=(-0.5, 0.6)) X_trans = scaler.fit_transform(X) assert_array_almost_equal(X_trans.min(axis=0), -0.5) assert_array_almost_equal(X_trans.max(axis=0), 0.6) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # raises on invalid range scaler = MinMaxScaler(feature_range=(2, 1)) with pytest.raises(ValueError): scaler.fit(X) def test_min_max_scaler_zero_variance_features(): # Check min max scaler on toy data with zero variance features X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.1], [0.0, 1.0, +1.1]] X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]] # default params scaler = MinMaxScaler() X_trans = scaler.fit_transform(X) X_expected_0_1 = [[0.0, 0.0, 0.5], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0]] assert_array_almost_equal(X_trans, X_expected_0_1) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) X_trans_new = scaler.transform(X_new) X_expected_0_1_new = [[+0.0, 1.0, 0.500], [-1.0, 0.0, 0.083], [+0.0, 0.0, 1.333]] assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2) # not default params scaler = MinMaxScaler(feature_range=(1, 2)) X_trans = scaler.fit_transform(X) X_expected_1_2 = [[1.0, 1.0, 1.5], [1.0, 1.0, 1.0], [1.0, 1.0, 2.0]] assert_array_almost_equal(X_trans, X_expected_1_2) # function interface X_trans = minmax_scale(X) assert_array_almost_equal(X_trans, X_expected_0_1) X_trans = minmax_scale(X, feature_range=(1, 2)) assert_array_almost_equal(X_trans, X_expected_1_2) def test_minmax_scale_axis1(): X = iris.data X_trans = minmax_scale(X, axis=1) assert_array_almost_equal(np.min(X_trans, axis=1), 0) assert_array_almost_equal(np.max(X_trans, axis=1), 1) def test_min_max_scaler_1d(): # Test scaling of dataset along single axis for X in [X_1row, X_1col, X_list_1row, X_list_1row]: scaler = MinMaxScaler(copy=True) X_scaled = scaler.fit(X).transform(X) if isinstance(X, list): X = np.array(X) # cast only after scaling done if _check_dim_1axis(X) == 1: assert_array_almost_equal(X_scaled.min(axis=0), np.zeros(n_features)) assert_array_almost_equal(X_scaled.max(axis=0), np.zeros(n_features)) else: assert_array_almost_equal(X_scaled.min(axis=0), 0.0) assert_array_almost_equal(X_scaled.max(axis=0), 1.0) assert scaler.n_samples_seen_ == X.shape[0] # check inverse transform X_scaled_back = scaler.inverse_transform(X_scaled) assert_array_almost_equal(X_scaled_back, X) # Constant feature X = np.ones((5, 1)) scaler = MinMaxScaler() X_scaled = scaler.fit(X).transform(X) assert X_scaled.min() >= 0.0 assert X_scaled.max() <= 1.0 assert scaler.n_samples_seen_ == X.shape[0] # Function interface X_1d = X_1row.ravel() min_ = X_1d.min() max_ = X_1d.max() assert_array_almost_equal( (X_1d - min_) / (max_ - min_), minmax_scale(X_1d, copy=True) ) @pytest.mark.parametrize("sample_weight", [True, None]) @pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) def test_scaler_without_centering(sample_weight, sparse_container): rng = np.random.RandomState(42) X = rng.randn(4, 5) X[:, 0] = 0.0 # first feature is always of zero X_sparse = sparse_container(X) if sample_weight: sample_weight = rng.rand(X.shape[0]) with pytest.raises(ValueError): StandardScaler().fit(X_sparse) scaler = StandardScaler(with_mean=False).fit(X, sample_weight=sample_weight) X_scaled = scaler.transform(X, copy=True) assert not np.any(np.isnan(X_scaled)) scaler_sparse = StandardScaler(with_mean=False).fit( X_sparse, sample_weight=sample_weight ) X_sparse_scaled = scaler_sparse.transform(X_sparse, copy=True) assert not np.any(np.isnan(X_sparse_scaled.data)) assert_array_almost_equal(scaler.mean_, scaler_sparse.mean_) assert_array_almost_equal(scaler.var_, scaler_sparse.var_) assert_array_almost_equal(scaler.scale_, scaler_sparse.scale_) assert_array_almost_equal(scaler.n_samples_seen_, scaler_sparse.n_samples_seen_) if sample_weight is None: assert_array_almost_equal( X_scaled.mean(axis=0), [0.0, -0.01, 2.24, -0.35, -0.78], 2 ) assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) X_sparse_scaled_mean, X_sparse_scaled_var = mean_variance_axis(X_sparse_scaled, 0) assert_array_almost_equal(X_sparse_scaled_mean, X_scaled.mean(axis=0)) assert_array_almost_equal(X_sparse_scaled_var, X_scaled.var(axis=0)) # Check that X has not been modified (copy) assert X_scaled is not X assert X_sparse_scaled is not X_sparse X_scaled_back = scaler.inverse_transform(X_scaled) assert X_scaled_back is not X assert X_scaled_back is not X_scaled assert_array_almost_equal(X_scaled_back, X) X_sparse_scaled_back = scaler_sparse.inverse_transform(X_sparse_scaled) assert X_sparse_scaled_back is not X_sparse assert X_sparse_scaled_back is not X_sparse_scaled assert_array_almost_equal(X_sparse_scaled_back.toarray(), X) if sparse_container in CSR_CONTAINERS: null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) X_null = null_transform.fit_transform(X_sparse) assert_array_equal(X_null.data, X_sparse.data) X_orig = null_transform.inverse_transform(X_null) assert_array_equal(X_orig.data, X_sparse.data) @pytest.mark.parametrize("with_mean", [True, False]) @pytest.mark.parametrize("with_std", [True, False]) @pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) def test_scaler_n_samples_seen_with_nan(with_mean, with_std, sparse_container): X = np.array( [[0, 1, 3], [np.nan, 6, 10], [5, 4, np.nan], [8, 0, np.nan]], dtype=np.float64 ) if sparse_container is not None: X = sparse_container(X) if sparse.issparse(X) and with_mean: pytest.skip("'with_mean=True' cannot be used with sparse matrix.") transformer = StandardScaler(with_mean=with_mean, with_std=with_std) transformer.fit(X) assert_array_equal(transformer.n_samples_seen_, np.array([3, 4, 2])) def _check_identity_scalers_attributes(scaler_1, scaler_2): assert scaler_1.mean_ is scaler_2.mean_ is None assert scaler_1.var_ is scaler_2.var_ is None assert scaler_1.scale_ is scaler_2.scale_ is None assert scaler_1.n_samples_seen_ == scaler_2.n_samples_seen_ @pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) def test_scaler_return_identity(sparse_container): # test that the scaler return identity when with_mean and with_std are # False X_dense = np.array([[0, 1, 3], [5, 6, 0], [8, 0, 10]], dtype=np.float64) X_sparse = sparse_container(X_dense) transformer_dense = StandardScaler(with_mean=False, with_std=False) X_trans_dense = transformer_dense.fit_transform(X_dense) assert_allclose(X_trans_dense, X_dense) transformer_sparse = clone(transformer_dense) X_trans_sparse = transformer_sparse.fit_transform(X_sparse) assert_allclose_dense_sparse(X_trans_sparse, X_sparse) _check_identity_scalers_attributes(transformer_dense, transformer_sparse) transformer_dense.partial_fit(X_dense) transformer_sparse.partial_fit(X_sparse) _check_identity_scalers_attributes(transformer_dense, transformer_sparse) transformer_dense.fit(X_dense) transformer_sparse.fit(X_sparse) _check_identity_scalers_attributes(transformer_dense, transformer_sparse) @pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) def test_scaler_int(sparse_container): # test that scaler converts integer input to floating # for both sparse and dense matrices rng = np.random.RandomState(42) X = rng.randint(20, size=(4, 5)) X[:, 0] = 0 # first feature is always of zero X_sparse = sparse_container(X) with warnings.catch_warnings(record=True): scaler = StandardScaler(with_mean=False).fit(X) X_scaled = scaler.transform(X, copy=True) assert not np.any(np.isnan(X_scaled)) with warnings.catch_warnings(record=True): scaler_sparse = StandardScaler(with_mean=False).fit(X_sparse) X_sparse_scaled = scaler_sparse.transform(X_sparse, copy=True) assert not np.any(np.isnan(X_sparse_scaled.data)) assert_array_almost_equal(scaler.mean_, scaler_sparse.mean_) assert_array_almost_equal(scaler.var_, scaler_sparse.var_) assert_array_almost_equal(scaler.scale_, scaler_sparse.scale_) assert_array_almost_equal( X_scaled.mean(axis=0), [0.0, 1.109, 1.856, 21.0, 1.559], 2 ) assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) X_sparse_scaled_mean, X_sparse_scaled_std = mean_variance_axis( X_sparse_scaled.astype(float), 0 ) assert_array_almost_equal(X_sparse_scaled_mean, X_scaled.mean(axis=0)) assert_array_almost_equal(X_sparse_scaled_std, X_scaled.std(axis=0)) # Check that X has not been modified (copy) assert X_scaled is not X assert X_sparse_scaled is not X_sparse X_scaled_back = scaler.inverse_transform(X_scaled) assert X_scaled_back is not X assert X_scaled_back is not X_scaled assert_array_almost_equal(X_scaled_back, X) X_sparse_scaled_back = scaler_sparse.inverse_transform(X_sparse_scaled) assert X_sparse_scaled_back is not X_sparse assert X_sparse_scaled_back is not X_sparse_scaled assert_array_almost_equal(X_sparse_scaled_back.toarray(), X) if sparse_container in CSR_CONTAINERS: null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) with warnings.catch_warnings(record=True): X_null = null_transform.fit_transform(X_sparse) assert_array_equal(X_null.data, X_sparse.data) X_orig = null_transform.inverse_transform(X_null) assert_array_equal(X_orig.data, X_sparse.data) @pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) def test_scaler_without_copy(sparse_container): # Check that StandardScaler.fit does not change input rng = np.random.RandomState(42) X = rng.randn(4, 5) X[:, 0] = 0.0 # first feature is always of zero X_sparse = sparse_container(X) X_copy = X.copy() StandardScaler(copy=False).fit(X) assert_array_equal(X, X_copy) X_sparse_copy = X_sparse.copy() StandardScaler(with_mean=False, copy=False).fit(X_sparse) assert_array_equal(X_sparse.toarray(), X_sparse_copy.toarray()) @pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) def test_scale_sparse_with_mean_raise_exception(sparse_container): rng = np.random.RandomState(42) X = rng.randn(4, 5) X_sparse = sparse_container(X) # check scaling and fit with direct calls on sparse data with pytest.raises(ValueError): scale(X_sparse, with_mean=True) with pytest.raises(ValueError): StandardScaler(with_mean=True).fit(X_sparse) # check transform and inverse_transform after a fit on a dense array scaler = StandardScaler(with_mean=True).fit(X) with pytest.raises(ValueError): scaler.transform(X_sparse) X_transformed_sparse = sparse_container(scaler.transform(X)) with pytest.raises(ValueError): scaler.inverse_transform(X_transformed_sparse) def test_scale_input_finiteness_validation(): # Check if non finite inputs raise ValueError X = [[np.inf, 5, 6, 7, 8]] with pytest.raises( ValueError, match="Input contains infinity or a value too large" ): scale(X) def test_robust_scaler_error_sparse(): X_sparse = sparse.rand(1000, 10) scaler = RobustScaler(with_centering=True) err_msg = "Cannot center sparse matrices" with pytest.raises(ValueError, match=err_msg): scaler.fit(X_sparse) @pytest.mark.parametrize("with_centering", [True, False]) @pytest.mark.parametrize("with_scaling", [True, False]) @pytest.mark.parametrize("X", [np.random.randn(10, 3), sparse.rand(10, 3, density=0.5)]) def test_robust_scaler_attributes(X, with_centering, with_scaling): # check consistent type of attributes if with_centering and sparse.issparse(X): pytest.skip("RobustScaler cannot center sparse matrix") scaler = RobustScaler(with_centering=with_centering, with_scaling=with_scaling) scaler.fit(X) if with_centering: assert isinstance(scaler.center_, np.ndarray) else: assert scaler.center_ is None if with_scaling: assert isinstance(scaler.scale_, np.ndarray) else: assert scaler.scale_ is None @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_robust_scaler_col_zero_sparse(csr_container): # check that the scaler is working when there is not data materialized in a # column of a sparse matrix X = np.random.randn(10, 5) X[:, 0] = 0 X = csr_container(X) scaler = RobustScaler(with_centering=False) scaler.fit(X) assert scaler.scale_[0] == pytest.approx(1) X_trans = scaler.transform(X) assert_allclose(X[:, [0]].toarray(), X_trans[:, [0]].toarray()) def test_robust_scaler_2d_arrays(): # Test robust scaling of 2d array along first axis rng = np.random.RandomState(0) X = rng.randn(4, 5) X[:, 0] = 0.0 # first feature is always of zero scaler = RobustScaler() X_scaled = scaler.fit(X).transform(X) assert_array_almost_equal(np.median(X_scaled, axis=0), 5 * [0.0]) assert_array_almost_equal(X_scaled.std(axis=0)[0], 0) @pytest.mark.parametrize("density", [0, 0.05, 0.1, 0.5, 1]) @pytest.mark.parametrize("strictly_signed", ["positive", "negative", "zeros", None]) def test_robust_scaler_equivalence_dense_sparse(density, strictly_signed): # Check the equivalence of the fitting with dense and sparse matrices X_sparse = sparse.rand(1000, 5, density=density).tocsc() if strictly_signed == "positive": X_sparse.data = np.abs(X_sparse.data) elif strictly_signed == "negative": X_sparse.data = -np.abs(X_sparse.data) elif strictly_signed == "zeros": X_sparse.data = np.zeros(X_sparse.data.shape, dtype=np.float64) X_dense = X_sparse.toarray() scaler_sparse = RobustScaler(with_centering=False) scaler_dense = RobustScaler(with_centering=False) scaler_sparse.fit(X_sparse) scaler_dense.fit(X_dense) assert_allclose(scaler_sparse.scale_, scaler_dense.scale_) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_robust_scaler_transform_one_row_csr(csr_container): # Check RobustScaler on transforming csr matrix with one row rng = np.random.RandomState(0) X = rng.randn(4, 5) single_row = np.array([[0.1, 1.0, 2.0, 0.0, -1.0]]) scaler = RobustScaler(with_centering=False) scaler = scaler.fit(X) row_trans = scaler.transform(csr_container(single_row)) row_expected = single_row / scaler.scale_ assert_array_almost_equal(row_trans.toarray(), row_expected) row_scaled_back = scaler.inverse_transform(row_trans) assert_array_almost_equal(single_row, row_scaled_back.toarray()) def test_robust_scaler_iris(): X = iris.data scaler = RobustScaler() X_trans = scaler.fit_transform(X) assert_array_almost_equal(np.median(X_trans, axis=0), 0) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) q = np.percentile(X_trans, q=(25, 75), axis=0) iqr = q[1] - q[0] assert_array_almost_equal(iqr, 1) def test_robust_scaler_iris_quantiles(): X = iris.data scaler = RobustScaler(quantile_range=(10, 90)) X_trans = scaler.fit_transform(X) assert_array_almost_equal(np.median(X_trans, axis=0), 0) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) q = np.percentile(X_trans, q=(10, 90), axis=0) q_range = q[1] - q[0] assert_array_almost_equal(q_range, 1) @pytest.mark.parametrize("csc_container", CSC_CONTAINERS) def test_quantile_transform_iris(csc_container): X = iris.data # uniform output distribution transformer = QuantileTransformer(n_quantiles=30) X_trans = transformer.fit_transform(X) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # normal output distribution transformer = QuantileTransformer(n_quantiles=30, output_distribution="normal") X_trans = transformer.fit_transform(X) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # make sure it is possible to take the inverse of a sparse matrix # which contain negative value; this is the case in the iris dataset X_sparse = csc_container(X) X_sparse_tran = transformer.fit_transform(X_sparse) X_sparse_tran_inv = transformer.inverse_transform(X_sparse_tran) assert_array_almost_equal(X_sparse.toarray(), X_sparse_tran_inv.toarray()) @pytest.mark.parametrize("csc_container", CSC_CONTAINERS) def test_quantile_transform_check_error(csc_container): X = np.transpose( [ [0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [2, 4, 0, 0, 6, 8, 0, 10, 0, 0], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1], ] ) X = csc_container(X) X_neg = np.transpose( [ [0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [-2, 4, 0, 0, 6, 8, 0, 10, 0, 0], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1], ] ) X_neg = csc_container(X_neg) err_msg = ( "The number of quantiles cannot be greater than " "the number of samples used. Got 1000 quantiles " "and 10 samples." ) with pytest.raises(ValueError, match=err_msg): QuantileTransformer(subsample=10).fit(X) transformer = QuantileTransformer(n_quantiles=10) err_msg = "QuantileTransformer only accepts non-negative sparse matrices." with pytest.raises(ValueError, match=err_msg): transformer.fit(X_neg) transformer.fit(X) err_msg = "QuantileTransformer only accepts non-negative sparse matrices." with pytest.raises(ValueError, match=err_msg): transformer.transform(X_neg) X_bad_feat = np.transpose( [[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]] ) err_msg = ( "X has 2 features, but QuantileTransformer is expecting 3 features as input." ) with pytest.raises(ValueError, match=err_msg): transformer.inverse_transform(X_bad_feat) transformer = QuantileTransformer(n_quantiles=10).fit(X) # check that an error is raised if input is scalar with pytest.raises(ValueError, match="Expected 2D array, got scalar array instead"): transformer.transform(10) # check that a warning is raised is n_quantiles > n_samples transformer = QuantileTransformer(n_quantiles=100) warn_msg = "n_quantiles is set to n_samples" with pytest.warns(UserWarning, match=warn_msg) as record: transformer.fit(X) assert len(record) == 1 assert transformer.n_quantiles_ == X.shape[0] @pytest.mark.parametrize("csc_container", CSC_CONTAINERS) def test_quantile_transform_sparse_ignore_zeros(csc_container): X = np.array([[0, 1], [0, 0], [0, 2], [0, 2], [0, 1]]) X_sparse = csc_container(X) transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5) # dense case -> warning raise warning_message = ( "'ignore_implicit_zeros' takes effect" " only with sparse matrix. This parameter has no" " effect." ) with pytest.warns(UserWarning, match=warning_message): transformer.fit(X) X_expected = np.array([[0, 0], [0, 0], [0, 1], [0, 1], [0, 0]]) X_trans = transformer.fit_transform(X_sparse) assert_almost_equal(X_expected, X_trans.toarray()) # consider the case where sparse entries are missing values and user-given # zeros are to be considered X_data = np.array([0, 0, 1, 0, 2, 2, 1, 0, 1, 2, 0]) X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]) X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6, 7, 8]) X_sparse = csc_container((X_data, (X_row, X_col))) X_trans = transformer.fit_transform(X_sparse) X_expected = np.array( [ [0.0, 0.5], [0.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 0.5], [0.0, 0.0], [0.0, 0.5], [0.0, 1.0], [0.0, 0.0], ] ) assert_almost_equal(X_expected, X_trans.toarray()) transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5) X_data = np.array([-1, -1, 1, 0, 0, 0, 1, -1, 1]) X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1]) X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6]) X_sparse = csc_container((X_data, (X_row, X_col))) X_trans = transformer.fit_transform(X_sparse) X_expected = np.array( [[0, 1], [0, 0.375], [0, 0.375], [0, 0.375], [0, 1], [0, 0], [0, 1]] ) assert_almost_equal(X_expected, X_trans.toarray()) assert_almost_equal( X_sparse.toarray(), transformer.inverse_transform(X_trans).toarray() ) # check in conjunction with subsampling transformer = QuantileTransformer( ignore_implicit_zeros=True, n_quantiles=5, subsample=8, random_state=0 ) X_trans = transformer.fit_transform(X_sparse) assert_almost_equal(X_expected, X_trans.toarray()) assert_almost_equal( X_sparse.toarray(), transformer.inverse_transform(X_trans).toarray() ) def test_quantile_transform_dense_toy(): X = np.array( [[0, 2, 2.6], [25, 4, 4.1], [50, 6, 2.3], [75, 8, 9.5], [100, 10, 0.1]] ) transformer = QuantileTransformer(n_quantiles=5) transformer.fit(X) # using a uniform output, each entry of X should be map between 0 and 1 # and equally spaced X_trans = transformer.fit_transform(X) X_expected = np.tile(np.linspace(0, 1, num=5), (3, 1)).T assert_almost_equal(np.sort(X_trans, axis=0), X_expected) X_test = np.array( [ [-1, 1, 0], [101, 11, 10], ] ) X_expected = np.array( [ [0, 0, 0], [1, 1, 1], ] ) assert_array_almost_equal(transformer.transform(X_test), X_expected) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) def test_quantile_transform_subsampling(): # Test that subsampling the input yield to a consistent results We check # that the computed quantiles are almost mapped to a [0, 1] vector where # values are equally spaced. The infinite norm is checked to be smaller # than a given threshold. This is repeated 5 times. # dense support n_samples = 1000000 n_quantiles = 1000 X = np.sort(np.random.sample((n_samples, 1)), axis=0) ROUND = 5 inf_norm_arr = [] for random_state in range(ROUND): transformer = QuantileTransformer( random_state=random_state, n_quantiles=n_quantiles, subsample=n_samples // 10, ) transformer.fit(X) diff = np.linspace(0, 1, n_quantiles) - np.ravel(transformer.quantiles_) inf_norm = np.max(np.abs(diff)) assert inf_norm < 1e-2 inf_norm_arr.append(inf_norm) # each random subsampling yield a unique approximation to the expected # linspace CDF assert len(np.unique(inf_norm_arr)) == len(inf_norm_arr) # sparse support X = sparse.rand(n_samples, 1, density=0.99, format="csc", random_state=0) inf_norm_arr = [] for random_state in range(ROUND): transformer = QuantileTransformer( random_state=random_state, n_quantiles=n_quantiles, subsample=n_samples // 10, ) transformer.fit(X) diff = np.linspace(0, 1, n_quantiles) - np.ravel(transformer.quantiles_) inf_norm = np.max(np.abs(diff)) assert inf_norm < 1e-1 inf_norm_arr.append(inf_norm) # each random subsampling yield a unique approximation to the expected # linspace CDF assert len(np.unique(inf_norm_arr)) == len(inf_norm_arr) def test_quantile_transform_subsampling_disabled(): """Check the behaviour of `QuantileTransformer` when `subsample=None`.""" X = np.random.RandomState(0).normal(size=(200, 1)) n_quantiles = 5 transformer = QuantileTransformer(n_quantiles=n_quantiles, subsample=None).fit(X) expected_references = np.linspace(0, 1, n_quantiles) assert_allclose(transformer.references_, expected_references) expected_quantiles = np.quantile(X.ravel(), expected_references) assert_allclose(transformer.quantiles_.ravel(), expected_quantiles) @pytest.mark.parametrize("csc_container", CSC_CONTAINERS) def test_quantile_transform_sparse_toy(csc_container): X = np.array( [ [0.0, 2.0, 0.0], [25.0, 4.0, 0.0], [50.0, 0.0, 2.6], [0.0, 0.0, 4.1], [0.0, 6.0, 0.0], [0.0, 8.0, 0.0], [75.0, 0.0, 2.3], [0.0, 10.0, 0.0], [0.0, 0.0, 9.5], [100.0, 0.0, 0.1], ] ) X = csc_container(X) transformer = QuantileTransformer(n_quantiles=10) transformer.fit(X) X_trans = transformer.fit_transform(X) assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.0) assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.0) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) transformer_dense = QuantileTransformer(n_quantiles=10).fit(X.toarray()) X_trans = transformer_dense.transform(X) assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.0) assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.0) X_trans_inv = transformer_dense.inverse_transform(X_trans) assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) def test_quantile_transform_axis1(): X = np.array([[0, 25, 50, 75, 100], [2, 4, 6, 8, 10], [2.6, 4.1, 2.3, 9.5, 0.1]]) X_trans_a0 = quantile_transform(X.T, axis=0, n_quantiles=5) X_trans_a1 = quantile_transform(X, axis=1, n_quantiles=5) assert_array_almost_equal(X_trans_a0, X_trans_a1.T) @pytest.mark.parametrize("csc_container", CSC_CONTAINERS) def test_quantile_transform_bounds(csc_container): # Lower and upper bounds are manually mapped. We checked that in the case # of a constant feature and binary feature, the bounds are properly mapped. X_dense = np.array([[0, 0], [0, 0], [1, 0]]) X_sparse = csc_container(X_dense) # check sparse and dense are consistent X_trans = QuantileTransformer(n_quantiles=3, random_state=0).fit_transform(X_dense) assert_array_almost_equal(X_trans, X_dense) X_trans_sp = QuantileTransformer(n_quantiles=3, random_state=0).fit_transform( X_sparse ) assert_array_almost_equal(X_trans_sp.toarray(), X_dense) assert_array_almost_equal(X_trans, X_trans_sp.toarray()) # check the consistency of the bounds by learning on 1 matrix # and transforming another X = np.array([[0, 1], [0, 0.5], [1, 0]]) X1 = np.array([[0, 0.1], [0, 0.5], [1, 0.1]]) transformer = QuantileTransformer(n_quantiles=3).fit(X) X_trans = transformer.transform(X1) assert_array_almost_equal(X_trans, X1) # check that values outside of the range learned will be mapped properly. X = np.random.random((1000, 1)) transformer = QuantileTransformer() transformer.fit(X) assert transformer.transform([[-10]]) == transformer.transform([[np.min(X)]]) assert transformer.transform([[10]]) == transformer.transform([[np.max(X)]]) assert transformer.inverse_transform([[-10]]) == transformer.inverse_transform( [[np.min(transformer.references_)]] ) assert transformer.inverse_transform([[10]]) == transformer.inverse_transform( [[np.max(transformer.references_)]] ) def test_quantile_transform_and_inverse(): X_1 = iris.data X_2 = np.array([[0.0], [BOUNDS_THRESHOLD / 10], [1.5], [2], [3], [3], [4]]) for X in [X_1, X_2]: transformer = QuantileTransformer(n_quantiles=1000, random_state=0) X_trans = transformer.fit_transform(X) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv, decimal=9) def test_quantile_transform_nan(): X = np.array([[np.nan, 0, 0, 1], [np.nan, np.nan, 0, 0.5], [np.nan, 1, 1, 0]]) transformer = QuantileTransformer(n_quantiles=10, random_state=42) transformer.fit_transform(X) # check that the quantile of the first column is all NaN assert np.isnan(transformer.quantiles_[:, 0]).all() # all other column should not contain NaN assert not np.isnan(transformer.quantiles_[:, 1:]).any() @pytest.mark.parametrize("array_type", ["array", "sparse"]) def test_quantile_transformer_sorted_quantiles(array_type): # Non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/15733 # Taken from upstream bug report: # https://github.com/numpy/numpy/issues/14685 X = np.array([0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 1, 1, 9, 9, 9, 8, 8, 7] * 10) X = 0.1 * X.reshape(-1, 1) X = _convert_container(X, array_type) n_quantiles = 100 qt = QuantileTransformer(n_quantiles=n_quantiles).fit(X) # Check that the estimated quantile thresholds are monotically # increasing: quantiles = qt.quantiles_[:, 0] assert len(quantiles) == 100 assert all(np.diff(quantiles) >= 0) def test_robust_scaler_invalid_range(): for range_ in [ (-1, 90), (-2, -3), (10, 101), (100.5, 101), (90, 50), ]: scaler = RobustScaler(quantile_range=range_) with pytest.raises(ValueError, match=r"Invalid quantile range: \("): scaler.fit(iris.data) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_scale_function_without_centering(csr_container): rng = np.random.RandomState(42) X = rng.randn(4, 5) X[:, 0] = 0.0 # first feature is always of zero X_csr = csr_container(X) X_scaled = scale(X, with_mean=False) assert not np.any(np.isnan(X_scaled)) X_csr_scaled = scale(X_csr, with_mean=False) assert not np.any(np.isnan(X_csr_scaled.data)) # test csc has same outcome X_csc_scaled = scale(X_csr.tocsc(), with_mean=False) assert_array_almost_equal(X_scaled, X_csc_scaled.toarray()) # raises value error on axis != 0 with pytest.raises(ValueError): scale(X_csr, with_mean=False, axis=1) assert_array_almost_equal( X_scaled.mean(axis=0), [0.0, -0.01, 2.24, -0.35, -0.78], 2 ) assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) # Check that X has not been copied assert X_scaled is not X X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0) assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0)) # null scale X_csr_scaled = scale(X_csr, with_mean=False, with_std=False, copy=True) assert_array_almost_equal(X_csr.toarray(), X_csr_scaled.toarray()) def test_robust_scale_axis1(): X = iris.data X_trans = robust_scale(X, axis=1) assert_array_almost_equal(np.median(X_trans, axis=1), 0) q = np.percentile(X_trans, q=(25, 75), axis=1) iqr = q[1] - q[0] assert_array_almost_equal(iqr, 1) def test_robust_scale_1d_array(): X = iris.data[:, 1] X_trans = robust_scale(X) assert_array_almost_equal(np.median(X_trans), 0) q = np.percentile(X_trans, q=(25, 75)) iqr = q[1] - q[0] assert_array_almost_equal(iqr, 1) def test_robust_scaler_zero_variance_features(): # Check RobustScaler on toy data with zero variance features X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.1], [0.0, 1.0, +1.1]] scaler = RobustScaler() X_trans = scaler.fit_transform(X) # NOTE: for such a small sample size, what we expect in the third column # depends HEAVILY on the method used to calculate quantiles. The values # here were calculated to fit the quantiles produces by np.percentile # using numpy 1.9 Calculating quantiles with # scipy.stats.mstats.scoreatquantile or scipy.stats.mstats.mquantiles # would yield very different results! X_expected = [[0.0, 0.0, +0.0], [0.0, 0.0, -1.0], [0.0, 0.0, +1.0]] assert_array_almost_equal(X_trans, X_expected) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # make sure new data gets transformed correctly X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]] X_trans_new = scaler.transform(X_new) X_expected_new = [[+0.0, 1.0, +0.0], [-1.0, 0.0, -0.83333], [+0.0, 0.0, +1.66667]] assert_array_almost_equal(X_trans_new, X_expected_new, decimal=3) def test_robust_scaler_unit_variance(): # Check RobustScaler with unit_variance=True on standard normal data with # outliers rng = np.random.RandomState(42) X = rng.randn(1000000, 1) X_with_outliers = np.vstack([X, np.ones((100, 1)) * 100, np.ones((100, 1)) * -100]) quantile_range = (1, 99) robust_scaler = RobustScaler(quantile_range=quantile_range, unit_variance=True).fit( X_with_outliers ) X_trans = robust_scaler.transform(X) assert robust_scaler.center_ == pytest.approx(0, abs=1e-3) assert robust_scaler.scale_ == pytest.approx(1, abs=1e-2) assert X_trans.std() == pytest.approx(1, abs=1e-2) @pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) def test_maxabs_scaler_zero_variance_features(sparse_container): # Check MaxAbsScaler on toy data with zero variance features X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.3], [0.0, 1.0, +1.5], [0.0, 0.0, +0.0]] scaler = MaxAbsScaler() X_trans = scaler.fit_transform(X) X_expected = [ [0.0, 1.0, 1.0 / 3.0], [0.0, 1.0, -0.2], [0.0, 1.0, 1.0], [0.0, 0.0, 0.0], ] assert_array_almost_equal(X_trans, X_expected) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # make sure new data gets transformed correctly X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]] X_trans_new = scaler.transform(X_new) X_expected_new = [[+0.0, 2.0, 1.0 / 3.0], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.0]] assert_array_almost_equal(X_trans_new, X_expected_new, decimal=2) # function interface X_trans = maxabs_scale(X) assert_array_almost_equal(X_trans, X_expected) # sparse data X_sparse = sparse_container(X) X_trans_sparse = scaler.fit_transform(X_sparse) X_expected = [ [0.0, 1.0, 1.0 / 3.0], [0.0, 1.0, -0.2], [0.0, 1.0, 1.0], [0.0, 0.0, 0.0], ] assert_array_almost_equal(X_trans_sparse.toarray(), X_expected) X_trans_sparse_inv = scaler.inverse_transform(X_trans_sparse) assert_array_almost_equal(X, X_trans_sparse_inv.toarray()) def test_maxabs_scaler_large_negative_value(): # Check MaxAbsScaler on toy data with a large negative value X = [ [0.0, 1.0, +0.5, -1.0], [0.0, 1.0, -0.3, -0.5], [0.0, 1.0, -100.0, 0.0], [0.0, 0.0, +0.0, -2.0], ] scaler = MaxAbsScaler() X_trans = scaler.fit_transform(X) X_expected = [ [0.0, 1.0, 0.005, -0.5], [0.0, 1.0, -0.003, -0.25], [0.0, 1.0, -1.0, 0.0], [0.0, 0.0, 0.0, -1.0], ] assert_array_almost_equal(X_trans, X_expected) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_maxabs_scaler_transform_one_row_csr(csr_container): # Check MaxAbsScaler on transforming csr matrix with one row X = csr_container([[0.5, 1.0, 1.0]]) scaler = MaxAbsScaler() scaler = scaler.fit(X) X_trans = scaler.transform(X) X_expected = csr_container([[1.0, 1.0, 1.0]]) assert_array_almost_equal(X_trans.toarray(), X_expected.toarray()) X_scaled_back = scaler.inverse_transform(X_trans) assert_array_almost_equal(X.toarray(), X_scaled_back.toarray()) def test_maxabs_scaler_1d(): # Test scaling of dataset along single axis for X in [X_1row, X_1col, X_list_1row, X_list_1row]: scaler = MaxAbsScaler(copy=True) X_scaled = scaler.fit(X).transform(X) if isinstance(X, list): X = np.array(X) # cast only after scaling done if _check_dim_1axis(X) == 1: assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), np.ones(n_features)) else: assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.0) assert scaler.n_samples_seen_ == X.shape[0] # check inverse transform X_scaled_back = scaler.inverse_transform(X_scaled) assert_array_almost_equal(X_scaled_back, X) # Constant feature X = np.ones((5, 1)) scaler = MaxAbsScaler() X_scaled = scaler.fit(X).transform(X) assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.0) assert scaler.n_samples_seen_ == X.shape[0] # function interface X_1d = X_1row.ravel() max_abs = np.abs(X_1d).max() assert_array_almost_equal(X_1d / max_abs, maxabs_scale(X_1d, copy=True)) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_maxabs_scaler_partial_fit(csr_container): # Test if partial_fit run over many batches of size 1 and 50 # gives the same results as fit X = X_2d[:100, :] n = X.shape[0] for chunk_size in [1, 2, 50, n, n + 42]: # Test mean at the end of the process scaler_batch = MaxAbsScaler().fit(X) scaler_incr = MaxAbsScaler() scaler_incr_csr = MaxAbsScaler() scaler_incr_csc = MaxAbsScaler() for batch in gen_batches(n, chunk_size): scaler_incr = scaler_incr.partial_fit(X[batch]) X_csr = csr_container(X[batch]) scaler_incr_csr = scaler_incr_csr.partial_fit(X_csr) X_csc = csr_container(X[batch]) scaler_incr_csc = scaler_incr_csc.partial_fit(X_csc) assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_) assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr_csr.max_abs_) assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr_csc.max_abs_) assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ assert scaler_batch.n_samples_seen_ == scaler_incr_csr.n_samples_seen_ assert scaler_batch.n_samples_seen_ == scaler_incr_csc.n_samples_seen_ assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csr.scale_) assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csc.scale_) assert_array_almost_equal(scaler_batch.transform(X), scaler_incr.transform(X)) # Test std after 1 step batch0 = slice(0, chunk_size) scaler_batch = MaxAbsScaler().fit(X[batch0]) scaler_incr = MaxAbsScaler().partial_fit(X[batch0]) assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_) assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) assert_array_almost_equal(scaler_batch.transform(X), scaler_incr.transform(X)) # Test std until the end of partial fits, and scaler_batch = MaxAbsScaler().fit(X) scaler_incr = MaxAbsScaler() # Clean estimator for i, batch in enumerate(gen_batches(n, chunk_size)): scaler_incr = scaler_incr.partial_fit(X[batch]) assert_correct_incr( i, batch_start=batch.start, batch_stop=batch.stop, n=n, chunk_size=chunk_size, n_samples_seen=scaler_incr.n_samples_seen_, ) def check_normalizer(norm, X_norm): """ Convenient checking function for `test_normalizer_l1_l2_max` and `test_normalizer_l1_l2_max_non_csr` """ if norm == "l1": row_sums = np.abs(X_norm).sum(axis=1) for i in range(3): assert_almost_equal(row_sums[i], 1.0) assert_almost_equal(row_sums[3], 0.0) elif norm == "l2": for i in range(3): assert_almost_equal(la.norm(X_norm[i]), 1.0) assert_almost_equal(la.norm(X_norm[3]), 0.0) elif norm == "max": row_maxs = abs(X_norm).max(axis=1) for i in range(3): assert_almost_equal(row_maxs[i], 1.0) assert_almost_equal(row_maxs[3], 0.0) @pytest.mark.parametrize("norm", ["l1", "l2", "max"]) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_normalizer_l1_l2_max(norm, csr_container): rng = np.random.RandomState(0) X_dense = rng.randn(4, 5) X_sparse_unpruned = csr_container(X_dense) # set the row number 3 to zero X_dense[3, :] = 0.0 # set the row number 3 to zero without pruning (can happen in real life) indptr_3 = X_sparse_unpruned.indptr[3] indptr_4 = X_sparse_unpruned.indptr[4] X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0 # build the pruned variant using the regular constructor X_sparse_pruned = csr_container(X_dense) # check inputs that support the no-copy optim for X in (X_dense, X_sparse_pruned, X_sparse_unpruned): normalizer = Normalizer(norm=norm, copy=True) X_norm1 = normalizer.transform(X) assert X_norm1 is not X X_norm1 = toarray(X_norm1) normalizer = Normalizer(norm=norm, copy=False) X_norm2 = normalizer.transform(X) assert X_norm2 is X X_norm2 = toarray(X_norm2) for X_norm in (X_norm1, X_norm2): check_normalizer(norm, X_norm) @pytest.mark.parametrize("norm", ["l1", "l2", "max"]) @pytest.mark.parametrize( "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + LIL_CONTAINERS ) def test_normalizer_l1_l2_max_non_csr(norm, sparse_container): rng = np.random.RandomState(0) X_dense = rng.randn(4, 5) # set the row number 3 to zero X_dense[3, :] = 0.0 X = sparse_container(X_dense) X_norm = Normalizer(norm=norm, copy=False).transform(X) assert X_norm is not X assert sparse.issparse(X_norm) and X_norm.format == "csr" X_norm = toarray(X_norm) check_normalizer(norm, X_norm) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_normalizer_max_sign(csr_container): # check that we normalize by a positive number even for negative data rng = np.random.RandomState(0) X_dense = rng.randn(4, 5) # set the row number 3 to zero X_dense[3, :] = 0.0 # check for mixed data where the value with # largest magnitude is negative X_dense[2, abs(X_dense[2, :]).argmax()] *= -1 X_all_neg = -np.abs(X_dense) X_all_neg_sparse = csr_container(X_all_neg) for X in (X_dense, X_all_neg, X_all_neg_sparse): normalizer = Normalizer(norm="max") X_norm = normalizer.transform(X) assert X_norm is not X X_norm = toarray(X_norm) assert_array_equal(np.sign(X_norm), np.sign(toarray(X))) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_normalize(csr_container): # Test normalize function # Only tests functionality not used by the tests for Normalizer. X = np.random.RandomState(37).randn(3, 2) assert_array_equal(normalize(X, copy=False), normalize(X.T, axis=0, copy=False).T) rs = np.random.RandomState(0) X_dense = rs.randn(10, 5) X_sparse = csr_container(X_dense) ones = np.ones((10)) for X in (X_dense, X_sparse): for dtype in (np.float32, np.float64): for norm in ("l1", "l2"): X = X.astype(dtype) X_norm = normalize(X, norm=norm) assert X_norm.dtype == dtype X_norm = toarray(X_norm) if norm == "l1": row_sums = np.abs(X_norm).sum(axis=1) else: X_norm_squared = X_norm**2 row_sums = X_norm_squared.sum(axis=1) assert_array_almost_equal(row_sums, ones) # Test return_norm X_dense = np.array([[3.0, 0, 4.0], [1.0, 0.0, 0.0], [2.0, 3.0, 0.0]]) for norm in ("l1", "l2", "max"): _, norms = normalize(X_dense, norm=norm, return_norm=True) if norm == "l1": assert_array_almost_equal(norms, np.array([7.0, 1.0, 5.0])) elif norm == "l2": assert_array_almost_equal(norms, np.array([5.0, 1.0, 3.60555127])) else: assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0])) X_sparse = csr_container(X_dense) for norm in ("l1", "l2"): with pytest.raises(NotImplementedError): normalize(X_sparse, norm=norm, return_norm=True) _, norms = normalize(X_sparse, norm="max", return_norm=True) assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0])) @pytest.mark.parametrize( "constructor", [np.array, list] + CSC_CONTAINERS + CSR_CONTAINERS ) def test_binarizer(constructor): X_ = np.array([[1, 0, 5], [2, 3, -1]]) X = constructor(X_.copy()) binarizer = Binarizer(threshold=2.0, copy=True) X_bin = toarray(binarizer.transform(X)) assert np.sum(X_bin == 0) == 4 assert np.sum(X_bin == 1) == 2 X_bin = binarizer.transform(X) assert sparse.issparse(X) == sparse.issparse(X_bin) binarizer = Binarizer(copy=True).fit(X) X_bin = toarray(binarizer.transform(X)) assert X_bin is not X assert np.sum(X_bin == 0) == 2 assert np.sum(X_bin == 1) == 4 binarizer = Binarizer(copy=True) X_bin = binarizer.transform(X) assert X_bin is not X X_bin = toarray(X_bin) assert np.sum(X_bin == 0) == 2 assert np.sum(X_bin == 1) == 4 binarizer = Binarizer(copy=False) X_bin = binarizer.transform(X) if constructor is not list: assert X_bin is X binarizer = Binarizer(copy=False) X_float = np.array([[1, 0, 5], [2, 3, -1]], dtype=np.float64) X_bin = binarizer.transform(X_float) if constructor is not list: assert X_bin is X_float X_bin = toarray(X_bin) assert np.sum(X_bin == 0) == 2 assert np.sum(X_bin == 1) == 4 binarizer = Binarizer(threshold=-0.5, copy=True) if constructor in (np.array, list): X = constructor(X_.copy()) X_bin = toarray(binarizer.transform(X)) assert np.sum(X_bin == 0) == 1 assert np.sum(X_bin == 1) == 5 X_bin = binarizer.transform(X) # Cannot use threshold < 0 for sparse if constructor in CSC_CONTAINERS: with pytest.raises(ValueError): binarizer.transform(constructor(X)) @pytest.mark.parametrize( "array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations() ) def test_binarizer_array_api_int(array_namespace, device, dtype_name): # Checks that Binarizer works with integer elements and float threshold xp = _array_api_for_tests(array_namespace, device) for dtype_name_ in [dtype_name, "int32", "int64"]: X_np = np.reshape(np.asarray([0, 1, 2, 3, 4], dtype=dtype_name_), (-1, 1)) X_xp = xp.asarray(X_np, device=device) binarized_np = Binarizer(threshold=2.5).fit_transform(X_np) with config_context(array_api_dispatch=True): binarized_xp = Binarizer(threshold=2.5).fit_transform(X_xp) assert_array_equal(_convert_to_numpy(binarized_xp, xp), binarized_np) def test_center_kernel(): # Test that KernelCenterer is equivalent to StandardScaler # in feature space rng = np.random.RandomState(0) X_fit = rng.random_sample((5, 4)) scaler = StandardScaler(with_std=False) scaler.fit(X_fit) X_fit_centered = scaler.transform(X_fit) K_fit = np.dot(X_fit, X_fit.T) # center fit time matrix centerer = KernelCenterer() K_fit_centered = np.dot(X_fit_centered, X_fit_centered.T) K_fit_centered2 = centerer.fit_transform(K_fit) assert_array_almost_equal(K_fit_centered, K_fit_centered2) # center predict time matrix X_pred = rng.random_sample((2, 4)) K_pred = np.dot(X_pred, X_fit.T) X_pred_centered = scaler.transform(X_pred) K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T) K_pred_centered2 = centerer.transform(K_pred) assert_array_almost_equal(K_pred_centered, K_pred_centered2) # check the results coherence with the method proposed in: # B. Schölkopf, A. Smola, and K.R. Müller, # "Nonlinear component analysis as a kernel eigenvalue problem" # equation (B.3) # K_centered3 = (I - 1_M) K (I - 1_M) # = K - 1_M K - K 1_M + 1_M K 1_M ones_M = np.ones_like(K_fit) / K_fit.shape[0] K_fit_centered3 = K_fit - ones_M @ K_fit - K_fit @ ones_M + ones_M @ K_fit @ ones_M assert_allclose(K_fit_centered, K_fit_centered3) # K_test_centered3 = (K_test - 1'_M K)(I - 1_M) # = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M ones_prime_M = np.ones_like(K_pred) / K_fit.shape[0] K_pred_centered3 = ( K_pred - ones_prime_M @ K_fit - K_pred @ ones_M + ones_prime_M @ K_fit @ ones_M ) assert_allclose(K_pred_centered, K_pred_centered3) def test_kernelcenterer_non_linear_kernel(): """Check kernel centering for non-linear kernel.""" rng = np.random.RandomState(0) X, X_test = rng.randn(100, 50), rng.randn(20, 50) def phi(X): """Our mapping function phi.""" return np.vstack( [ np.clip(X, a_min=0, a_max=None), -np.clip(X, a_min=None, a_max=0), ] ) phi_X = phi(X) phi_X_test = phi(X_test) # centered the projection scaler = StandardScaler(with_std=False) phi_X_center = scaler.fit_transform(phi_X) phi_X_test_center = scaler.transform(phi_X_test) # create the different kernel K = phi_X @ phi_X.T K_test = phi_X_test @ phi_X.T K_center = phi_X_center @ phi_X_center.T K_test_center = phi_X_test_center @ phi_X_center.T kernel_centerer = KernelCenterer() kernel_centerer.fit(K) assert_allclose(kernel_centerer.transform(K), K_center) assert_allclose(kernel_centerer.transform(K_test), K_test_center) # check the results coherence with the method proposed in: # B. Schölkopf, A. Smola, and K.R. Müller, # "Nonlinear component analysis as a kernel eigenvalue problem" # equation (B.3) # K_centered = (I - 1_M) K (I - 1_M) # = K - 1_M K - K 1_M + 1_M K 1_M ones_M = np.ones_like(K) / K.shape[0] K_centered = K - ones_M @ K - K @ ones_M + ones_M @ K @ ones_M assert_allclose(kernel_centerer.transform(K), K_centered) # K_test_centered = (K_test - 1'_M K)(I - 1_M) # = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M ones_prime_M = np.ones_like(K_test) / K.shape[0] K_test_centered = ( K_test - ones_prime_M @ K - K_test @ ones_M + ones_prime_M @ K @ ones_M ) assert_allclose(kernel_centerer.transform(K_test), K_test_centered) def test_cv_pipeline_precomputed(): # Cross-validate a regression on four coplanar points with the same # value. Use precomputed kernel to ensure Pipeline with KernelCenterer # is treated as a pairwise operation. X = np.array([[3, 0, 0], [0, 3, 0], [0, 0, 3], [1, 1, 1]]) y_true = np.ones((4,)) K = X.dot(X.T) kcent = KernelCenterer() pipeline = Pipeline([("kernel_centerer", kcent), ("svr", SVR())]) # did the pipeline set the pairwise attribute? assert pipeline.__sklearn_tags__().input_tags.pairwise # test cross-validation, score should be almost perfect # NB: this test is pretty vacuous -- it's mainly to test integration # of Pipeline and KernelCenterer y_pred = cross_val_predict(pipeline, K, y_true, cv=2) assert_array_almost_equal(y_true, y_pred) def test_fit_transform(): rng = np.random.RandomState(0) X = rng.random_sample((5, 4)) for obj in (StandardScaler(), Normalizer(), Binarizer()): X_transformed = obj.fit(X).transform(X) X_transformed2 = obj.fit_transform(X) assert_array_equal(X_transformed, X_transformed2) def test_add_dummy_feature(): X = [[1, 0], [0, 1], [0, 1]] X = add_dummy_feature(X) assert_array_equal(X, [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) @pytest.mark.parametrize( "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS ) def test_add_dummy_feature_sparse(sparse_container): X = sparse_container([[1, 0], [0, 1], [0, 1]]) desired_format = X.format X = add_dummy_feature(X) assert sparse.issparse(X) and X.format == desired_format, X assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) def test_fit_cold_start(): X = iris.data X_2d = X[:, :2] # Scalers that have a partial_fit method scalers = [ StandardScaler(with_mean=False, with_std=False), MinMaxScaler(), MaxAbsScaler(), ] for scaler in scalers: scaler.fit_transform(X) # with a different shape, this may break the scaler unless the internal # state is reset scaler.fit_transform(X_2d) @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) def test_power_transformer_notfitted(method): pt = PowerTransformer(method=method) X = np.abs(X_1col) with pytest.raises(NotFittedError): pt.transform(X) with pytest.raises(NotFittedError): pt.inverse_transform(X) @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) @pytest.mark.parametrize("standardize", [True, False]) @pytest.mark.parametrize("X", [X_1col, X_2d]) def test_power_transformer_inverse(method, standardize, X): # Make sure we get the original input when applying transform and then # inverse transform X = np.abs(X) if method == "box-cox" else X pt = PowerTransformer(method=method, standardize=standardize) X_trans = pt.fit_transform(X) assert_almost_equal(X, pt.inverse_transform(X_trans)) def test_power_transformer_1d(): X = np.abs(X_1col) for standardize in [True, False]: pt = PowerTransformer(method="box-cox", standardize=standardize) X_trans = pt.fit_transform(X) X_trans_func = power_transform(X, method="box-cox", standardize=standardize) X_expected, lambda_expected = stats.boxcox(X.flatten()) if standardize: X_expected = scale(X_expected) assert_almost_equal(X_expected.reshape(-1, 1), X_trans) assert_almost_equal(X_expected.reshape(-1, 1), X_trans_func) assert_almost_equal(X, pt.inverse_transform(X_trans)) assert_almost_equal(lambda_expected, pt.lambdas_[0]) assert len(pt.lambdas_) == X.shape[1] assert isinstance(pt.lambdas_, np.ndarray) def test_power_transformer_2d(): X = np.abs(X_2d) for standardize in [True, False]: pt = PowerTransformer(method="box-cox", standardize=standardize) X_trans_class = pt.fit_transform(X) X_trans_func = power_transform(X, method="box-cox", standardize=standardize) for X_trans in [X_trans_class, X_trans_func]: for j in range(X_trans.shape[1]): X_expected, lmbda = stats.boxcox(X[:, j].flatten()) if standardize: X_expected = scale(X_expected) assert_almost_equal(X_trans[:, j], X_expected) assert_almost_equal(lmbda, pt.lambdas_[j]) # Test inverse transformation X_inv = pt.inverse_transform(X_trans) assert_array_almost_equal(X_inv, X) assert len(pt.lambdas_) == X.shape[1] assert isinstance(pt.lambdas_, np.ndarray) def test_power_transformer_boxcox_strictly_positive_exception(): # Exceptions should be raised for negative arrays and zero arrays when # method is boxcox pt = PowerTransformer(method="box-cox") pt.fit(np.abs(X_2d)) X_with_negatives = X_2d not_positive_message = "strictly positive" with pytest.raises(ValueError, match=not_positive_message): pt.transform(X_with_negatives) with pytest.raises(ValueError, match=not_positive_message): pt.fit(X_with_negatives) with pytest.raises(ValueError, match=not_positive_message): power_transform(X_with_negatives, method="box-cox") with pytest.raises(ValueError, match=not_positive_message): pt.transform(np.zeros(X_2d.shape)) with pytest.raises(ValueError, match=not_positive_message): pt.fit(np.zeros(X_2d.shape)) with pytest.raises(ValueError, match=not_positive_message): power_transform(np.zeros(X_2d.shape), method="box-cox") @pytest.mark.parametrize("X", [X_2d, np.abs(X_2d), -np.abs(X_2d), np.zeros(X_2d.shape)]) def test_power_transformer_yeojohnson_any_input(X): # Yeo-Johnson method should support any kind of input power_transform(X, method="yeo-johnson") @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) def test_power_transformer_shape_exception(method): pt = PowerTransformer(method=method) X = np.abs(X_2d) pt.fit(X) # Exceptions should be raised for arrays with different num_columns # than during fitting wrong_shape_message = ( r"X has \d+ features, but PowerTransformer is expecting \d+ features" ) with pytest.raises(ValueError, match=wrong_shape_message): pt.transform(X[:, 0:1]) with pytest.raises(ValueError, match=wrong_shape_message): pt.inverse_transform(X[:, 0:1]) def test_power_transformer_lambda_zero(): pt = PowerTransformer(method="box-cox", standardize=False) X = np.abs(X_2d)[:, 0:1] # Test the lambda = 0 case pt.lambdas_ = np.array([0]) X_trans = pt.transform(X) assert_array_almost_equal(pt.inverse_transform(X_trans), X) def test_power_transformer_lambda_one(): # Make sure lambda = 1 corresponds to the identity for yeo-johnson pt = PowerTransformer(method="yeo-johnson", standardize=False) X = np.abs(X_2d)[:, 0:1] pt.lambdas_ = np.array([1]) X_trans = pt.transform(X) assert_array_almost_equal(X_trans, X) @pytest.mark.parametrize( "method, lmbda", [ ("box-cox", 0.1), ("box-cox", 0.5), ("yeo-johnson", 0.1), ("yeo-johnson", 0.5), ("yeo-johnson", 1.0), ], ) def test_optimization_power_transformer(method, lmbda): # Test the optimization procedure: # - set a predefined value for lambda # - apply inverse_transform to a normal dist (we get X_inv) # - apply fit_transform to X_inv (we get X_inv_trans) # - check that X_inv_trans is roughly equal to X rng = np.random.RandomState(0) n_samples = 20000 X = rng.normal(loc=0, scale=1, size=(n_samples, 1)) if method == "box-cox": # For box-cox, means that lmbda * y + 1 > 0 or y > - 1 / lmbda # Clip the data here to make sure the inequality is valid. X = np.clip(X, -1 / lmbda + 1e-5, None) pt = PowerTransformer(method=method, standardize=False) pt.lambdas_ = [lmbda] X_inv = pt.inverse_transform(X) pt = PowerTransformer(method=method, standardize=False) X_inv_trans = pt.fit_transform(X_inv) assert_almost_equal(0, np.linalg.norm(X - X_inv_trans) / n_samples, decimal=2) assert_almost_equal(0, X_inv_trans.mean(), decimal=1) assert_almost_equal(1, X_inv_trans.std(), decimal=1) def test_invserse_box_cox(): # output nan if the input is invalid pt = PowerTransformer(method="box-cox", standardize=False) pt.lambdas_ = [0.5] X_inv = pt.inverse_transform([[-2.1]]) assert np.isnan(X_inv) def test_yeo_johnson_darwin_example(): # test from original paper "A new family of power transformations to # improve normality or symmetry" by Yeo and Johnson. X = [6.1, -8.4, 1.0, 2.0, 0.7, 2.9, 3.5, 5.1, 1.8, 3.6, 7.0, 3.0, 9.3, 7.5, -6.0] X = np.array(X).reshape(-1, 1) lmbda = PowerTransformer(method="yeo-johnson").fit(X).lambdas_ assert np.allclose(lmbda, 1.305, atol=1e-3) @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) def test_power_transformer_nans(method): # Make sure lambda estimation is not influenced by NaN values # and that transform() supports NaN silently X = np.abs(X_1col) pt = PowerTransformer(method=method) pt.fit(X) lmbda_no_nans = pt.lambdas_[0] # concat nans at the end and check lambda stays the same X = np.concatenate([X, np.full_like(X, np.nan)]) X = shuffle(X, random_state=0) pt.fit(X) lmbda_nans = pt.lambdas_[0] assert_almost_equal(lmbda_no_nans, lmbda_nans, decimal=5) X_trans = pt.transform(X) assert_array_equal(np.isnan(X_trans), np.isnan(X)) @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) @pytest.mark.parametrize("standardize", [True, False]) def test_power_transformer_fit_transform(method, standardize): # check that fit_transform() and fit().transform() return the same values X = X_1col if method == "box-cox": X = np.abs(X) pt = PowerTransformer(method, standardize=standardize) assert_array_almost_equal(pt.fit(X).transform(X), pt.fit_transform(X)) @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) @pytest.mark.parametrize("standardize", [True, False]) def test_power_transformer_copy_True(method, standardize): # Check that neither fit, transform, fit_transform nor inverse_transform # modify X inplace when copy=True X = X_1col if method == "box-cox": X = np.abs(X) X_original = X.copy() assert X is not X_original # sanity checks assert_array_almost_equal(X, X_original) pt = PowerTransformer(method, standardize=standardize, copy=True) pt.fit(X) assert_array_almost_equal(X, X_original) X_trans = pt.transform(X) assert X_trans is not X X_trans = pt.fit_transform(X) assert_array_almost_equal(X, X_original) assert X_trans is not X X_inv_trans = pt.inverse_transform(X_trans) assert X_trans is not X_inv_trans @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) @pytest.mark.parametrize("standardize", [True, False]) def test_power_transformer_copy_False(method, standardize): # check that when copy=False fit doesn't change X inplace but transform, # fit_transform and inverse_transform do. X = X_1col if method == "box-cox": X = np.abs(X) X_original = X.copy() assert X is not X_original # sanity checks assert_array_almost_equal(X, X_original) pt = PowerTransformer(method, standardize=standardize, copy=False) pt.fit(X) assert_array_almost_equal(X, X_original) # fit didn't change X X_trans = pt.transform(X) assert X_trans is X if method == "box-cox": X = np.abs(X) X_trans = pt.fit_transform(X) assert X_trans is X X_inv_trans = pt.inverse_transform(X_trans) assert X_trans is X_inv_trans def test_power_transformer_box_cox_raise_all_nans_col(): """Check that box-cox raises informative when a column contains all nans. Non-regression test for gh-26303 """ X = rng.random_sample((4, 5)) X[:, 0] = np.nan err_msg = "Column must not be all nan." pt = PowerTransformer(method="box-cox") with pytest.raises(ValueError, match=err_msg): pt.fit_transform(X) @pytest.mark.parametrize( "X_2", [sparse.random(10, 1, density=0.8, random_state=0)] + [ csr_container(np.full((10, 1), fill_value=np.nan)) for csr_container in CSR_CONTAINERS ], ) def test_standard_scaler_sparse_partial_fit_finite_variance(X_2): # non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/16448 X_1 = sparse.random(5, 1, density=0.8) scaler = StandardScaler(with_mean=False) scaler.fit(X_1).partial_fit(X_2) assert np.isfinite(scaler.var_[0]) @pytest.mark.parametrize("feature_range", [(0, 1), (-10, 10)]) def test_minmax_scaler_clip(feature_range): # test behaviour of the parameter 'clip' in MinMaxScaler X = iris.data scaler = MinMaxScaler(feature_range=feature_range, clip=True).fit(X) X_min, X_max = np.min(X, axis=0), np.max(X, axis=0) X_test = [np.r_[X_min[:2] - 10, X_max[2:] + 10]] X_transformed = scaler.transform(X_test) assert_allclose( X_transformed, [[feature_range[0], feature_range[0], feature_range[1], feature_range[1]]], ) def test_standard_scaler_raise_error_for_1d_input(): """Check that `inverse_transform` from `StandardScaler` raises an error with 1D array. Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/19518 """ scaler = StandardScaler().fit(X_2d) err_msg = "Expected 2D array, got 1D array instead" with pytest.raises(ValueError, match=err_msg): scaler.inverse_transform(X_2d[:, 0]) def test_power_transformer_significantly_non_gaussian(): """Check that significantly non-Gaussian data before transforms correctly. For some explored lambdas, the transformed data may be constant and will be rejected. Non-regression test for https://github.com/scikit-learn/scikit-learn/issues/14959 """ X_non_gaussian = 1e6 * np.array( [0.6, 2.0, 3.0, 4.0] * 4 + [11, 12, 12, 16, 17, 20, 85, 90], dtype=np.float64 ).reshape(-1, 1) pt = PowerTransformer() with warnings.catch_warnings(): warnings.simplefilter("error", RuntimeWarning) X_trans = pt.fit_transform(X_non_gaussian) assert not np.any(np.isnan(X_trans)) assert X_trans.mean() == pytest.approx(0.0) assert X_trans.std() == pytest.approx(1.0) assert X_trans.min() > -2 assert X_trans.max() < 2 @pytest.mark.parametrize( "Transformer", [ MinMaxScaler, MaxAbsScaler, RobustScaler, StandardScaler, QuantileTransformer, PowerTransformer, ], ) def test_one_to_one_features(Transformer): """Check one-to-one transformers give correct feature names.""" tr = Transformer().fit(iris.data) names_out = tr.get_feature_names_out(iris.feature_names) assert_array_equal(names_out, iris.feature_names) @pytest.mark.parametrize( "Transformer", [ MinMaxScaler, MaxAbsScaler, RobustScaler, StandardScaler, QuantileTransformer, PowerTransformer, Normalizer, Binarizer, ], ) def test_one_to_one_features_pandas(Transformer): """Check one-to-one transformers give correct feature names.""" pd = pytest.importorskip("pandas") df = pd.DataFrame(iris.data, columns=iris.feature_names) tr = Transformer().fit(df) names_out_df_default = tr.get_feature_names_out() assert_array_equal(names_out_df_default, iris.feature_names) names_out_df_valid_in = tr.get_feature_names_out(iris.feature_names) assert_array_equal(names_out_df_valid_in, iris.feature_names) msg = re.escape("input_features is not equal to feature_names_in_") with pytest.raises(ValueError, match=msg): invalid_names = list("abcd") tr.get_feature_names_out(invalid_names) def test_kernel_centerer_feature_names_out(): """Test that kernel centerer `feature_names_out`.""" rng = np.random.RandomState(0) X = rng.random_sample((6, 4)) X_pairwise = linear_kernel(X) centerer = KernelCenterer().fit(X_pairwise) names_out = centerer.get_feature_names_out() samples_out2 = X_pairwise.shape[1] assert_array_equal(names_out, [f"kernelcenterer{i}" for i in range(samples_out2)]) @pytest.mark.parametrize("standardize", [True, False]) def test_power_transformer_constant_feature(standardize): """Check that PowerTransfomer leaves constant features unchanged.""" X = [[-2, 0, 2], [-2, 0, 2], [-2, 0, 2]] pt = PowerTransformer(method="yeo-johnson", standardize=standardize).fit(X) assert_allclose(pt.lambdas_, [1, 1, 1]) Xft = pt.fit_transform(X) Xt = pt.transform(X) for Xt_ in [Xft, Xt]: if standardize: assert_allclose(Xt_, np.zeros_like(X)) else: assert_allclose(Xt_, X) @pytest.mark.skipif( sp_version < parse_version("1.12"), reason="scipy version 1.12 required for stable yeo-johnson", ) def test_power_transformer_no_warnings(): """Verify that PowerTransformer operates without raising any warnings on valid data. This test addresses numerical issues with floating point numbers (mostly overflows) with the Yeo-Johnson transform, see https://github.com/scikit-learn/scikit-learn/issues/23319#issuecomment-1464933635 """ x = np.array( [ 2003.0, 1950.0, 1997.0, 2000.0, 2009.0, 2009.0, 1980.0, 1999.0, 2007.0, 1991.0, ] ) def _test_no_warnings(data): """Internal helper to test for unexpected warnings.""" with warnings.catch_warnings(record=True) as caught_warnings: warnings.simplefilter("always") # Ensure all warnings are captured PowerTransformer(method="yeo-johnson", standardize=True).fit_transform(data) assert not caught_warnings, "Unexpected warnings were raised:\n" + "\n".join( str(w.message) for w in caught_warnings ) # Full dataset: Should not trigger overflow in variance calculation. _test_no_warnings(x.reshape(-1, 1)) # Subset of data: Should not trigger overflow in power calculation. _test_no_warnings(x[:5].reshape(-1, 1)) def test_yeojohnson_for_different_scipy_version(): """Check that the results are consistent across different SciPy versions.""" pt = PowerTransformer(method="yeo-johnson").fit(X_1col) pt.lambdas_[0] == pytest.approx(0.99546157, rel=1e-7)