"""
Test the pipeline module.
"""
import itertools
import re
import shutil
import time
from tempfile import mkdtemp
import joblib
import numpy as np
import pytest
from sklearn import config_context
from sklearn.base import (
BaseEstimator,
ClassifierMixin,
TransformerMixin,
clone,
is_classifier,
is_regressor,
)
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import (
HistGradientBoostingClassifier,
RandomForestClassifier,
RandomTreesEmbedding,
)
from sklearn.exceptions import NotFittedError, UnsetMetadataPassedError
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.svm import SVC
from sklearn.tests.metadata_routing_common import (
ConsumingNoFitTransformTransformer,
ConsumingTransformer,
_Registry,
check_recorded_metadata,
)
from sklearn.utils import get_tags
from sklearn.utils._metadata_requests import COMPOSITE_METHODS, METHODS
from sklearn.utils._testing import (
MinimalClassifier,
MinimalRegressor,
MinimalTransformer,
assert_allclose,
assert_array_almost_equal,
assert_array_equal,
)
from sklearn.utils.fixes import CSR_CONTAINERS
from sklearn.utils.validation import _check_feature_names, check_is_fitted


# Load shared datasets for the tests in this module. Mark them read-only to
# avoid unintentional in-place modifications that would introduce side effects
# between tests.
iris = load_iris()
iris.data.flags.writeable = False
iris.target.flags.writeable = False

JUNK_FOOD_DOCS = (
    "the pizza pizza beer copyright",
    "the pizza burger beer copyright",
    "the the pizza beer beer copyright",
    "the burger beer beer copyright",
    "the coke burger coke copyright",
    "the coke burger burger",
)
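

# The helper classes below are deliberately minimal mocks: each implements just
# enough of the estimator API (fit/transform/predict/...) to exercise
# Pipeline's parameter dispatching and method delegation without real models.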


class NoFit(BaseEstimator):
    """Small class to test parameter dispatching."""

    def __init__(self, a=None, b=None):
        self.a = a
        self.b = b


class NoTrans(NoFit):
    def fit(self, X, y=None):
        return self

    def get_params(self, deep=False):
        return {"a": self.a, "b": self.b}

    def set_params(self, **params):
        self.a = params["a"]
        return self


class NoInvTransf(TransformerMixin, NoTrans):
    def transform(self, X):
        return X


class Transf(NoInvTransf):
    def transform(self, X):
        return X

    def inverse_transform(self, X):
        return X


class TransfFitParams(Transf):
    def fit(self, X, y=None, **fit_params):
        self.fit_params = fit_params
        return self


class Mult(TransformerMixin, BaseEstimator):
    def __init__(self, mult=1):
        self.mult = mult

    def __sklearn_is_fitted__(self):
        return True

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.asarray(X) * self.mult

    def inverse_transform(self, X):
        return np.asarray(X) / self.mult

    def predict(self, X):
        return (np.asarray(X) * self.mult).sum(axis=1)

    predict_proba = predict_log_proba = decision_function = predict

    def score(self, X, y=None):
        return np.sum(X)


class FitParamT(BaseEstimator):
    """Mock classifier"""

    def __init__(self):
        self.successful = False

    def fit(self, X, y, should_succeed=False):
        self.successful = should_succeed
        self.fitted_ = True

    def predict(self, X):
        return self.successful

    def fit_predict(self, X, y, should_succeed=False):
        self.fit(X, y, should_succeed=should_succeed)
        return self.predict(X)

    def score(self, X, y=None, sample_weight=None):
        if sample_weight is not None:
            X = X * sample_weight
        return np.sum(X)


class DummyTransf(Transf):
    """Transformer which stores the column means."""

    def fit(self, X, y):
        self.means_ = np.mean(X, axis=0)
        # store a timestamp to figure out whether the result of 'fit' has
        # been cached or not
        self.timestamp_ = time.time()
        return self


class DummyEstimatorParams(BaseEstimator):
    """Mock classifier that takes params on predict"""

    def __sklearn_is_fitted__(self):
        return True

    def fit(self, X, y):
        return self

    def predict(self, X, got_attribute=False):
        self.got_attribute = got_attribute
        return self

    def predict_proba(self, X, got_attribute=False):
        self.got_attribute = got_attribute
        return self

    def predict_log_proba(self, X, got_attribute=False):
        self.got_attribute = got_attribute
        return self


def test_pipeline_invalid_parameters():
    # Test the various init parameters of the pipeline in the fit method
    pipeline = Pipeline([(1, 1)])
    with pytest.raises(TypeError):
        pipeline.fit([[1]], [1])

    # Check that we can't fit pipelines with objects without a fit method
    msg = (
        "Last step of Pipeline should implement fit "
        "or be the string 'passthrough'"
        ".*NoFit.*"
    )
    pipeline = Pipeline([("clf", NoFit())])
    with pytest.raises(TypeError, match=msg):
        pipeline.fit([[1]], [1])

    # Smoke test with only an estimator
    clf = NoTrans()
    pipe = Pipeline([("svc", clf)])
    assert pipe.get_params(deep=True) == dict(
        svc__a=None, svc__b=None, svc=clf, **pipe.get_params(deep=False)
    )

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert clf.a == 0.1
    assert clf.b is None
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC()
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([("anova", filter1), ("svc", clf)])

    # Check that estimators are not cloned on pipeline construction
    assert pipe.named_steps["anova"] is filter1
    assert pipe.named_steps["svc"] is clf

    # Check that we can't fit with non-transformers on the way.
    # Note that NoTrans implements fit, but not transform.
    msg = "All intermediate steps should be transformers.*\\bNoTrans\\b.*"
    pipeline = Pipeline([("t", NoTrans()), ("svc", clf)])
    with pytest.raises(TypeError, match=msg):
        pipeline.fit([[1]], [1])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert clf.C == 0.1
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    msg = re.escape(
        "Invalid parameter 'C' for estimator SelectKBest(). Valid parameters are: ['k',"
        " 'score_func']."
    )
    with pytest.raises(ValueError, match=msg):
        pipe.set_params(anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert pipe.named_steps["svc"] is not pipe2.named_steps["svc"]

    # Check that, apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop("svc")
    params.pop("anova")
    params2.pop("svc")
    params2.pop("anova")
    assert params == params2
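

# Note: nested parameters throughout this module use scikit-learn's
# '<step name>__<param>' convention, e.g. pipe.set_params(svc__C=0.1) routes
# C=0.1 to the step named 'svc'.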


def test_empty_pipeline():
    X = iris.data
    y = iris.target

    pipe = Pipeline([])
    msg = "The pipeline is empty. Please add steps."
    with pytest.raises(ValueError, match=msg):
        pipe.fit(X, y)


def test_pipeline_init_tuple():
    # Pipeline accepts steps as tuple
    X = np.array([[1, 2]])
    pipe = Pipeline((("transf", Transf()), ("clf", FitParamT())))
    pipe.fit(X, y=None)
    pipe.score(X)

    pipe.set_params(transf="passthrough")
    pipe.fit(X, y=None)
    pipe.score(X)


def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression()
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([("anova", filter1), ("logistic", clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)


def test_pipeline_fit_params():
    # Test that the pipeline can take fit parameters
    pipe = Pipeline([("transf", Transf()), ("clf", FitParamT())])
    pipe.fit(X=None, y=None, clf__should_succeed=True)
    # classifier should return True
    assert pipe.predict(None)
    # and transformer params should not be changed
    assert pipe.named_steps["transf"].a is None
    assert pipe.named_steps["transf"].b is None
    # invalid parameters should raise an error message
    msg = re.escape("fit() got an unexpected keyword argument 'bad'")
    with pytest.raises(TypeError, match=msg):
        pipe.fit(None, None, clf__bad=True)


def test_pipeline_sample_weight_supported():
    # Pipeline should pass sample_weight
    X = np.array([[1, 2]])
    pipe = Pipeline([("transf", Transf()), ("clf", FitParamT())])
    pipe.fit(X, y=None)
    assert pipe.score(X) == 3
    assert pipe.score(X, y=None) == 3
    assert pipe.score(X, y=None, sample_weight=None) == 3
    assert pipe.score(X, sample_weight=np.array([2, 3])) == 8


def test_pipeline_sample_weight_unsupported():
    # When sample_weight is None it shouldn't be passed
    X = np.array([[1, 2]])
    pipe = Pipeline([("transf", Transf()), ("clf", Mult())])
    pipe.fit(X, y=None)
    assert pipe.score(X) == 3
    assert pipe.score(X, sample_weight=None) == 3

    msg = re.escape("score() got an unexpected keyword argument 'sample_weight'")
    with pytest.raises(TypeError, match=msg):
        pipe.score(X, sample_weight=np.array([2, 3]))


def test_pipeline_raise_set_params_error():
    # Test pipeline raises set params error message for nested models.
    pipe = Pipeline([("cls", LinearRegression())])

    # expected error message
    error_msg = re.escape(
        "Invalid parameter 'fake' for estimator Pipeline(steps=[('cls',"
        " LinearRegression())]). Valid parameters are: ['memory', 'steps',"
        " 'transform_input', 'verbose']."
    )
    with pytest.raises(ValueError, match=error_msg):
        pipe.set_params(fake="nope")

    # invalid outer parameter name for compound parameter: the expected error message
    # is the same as above.
    with pytest.raises(ValueError, match=error_msg):
        pipe.set_params(fake__estimator="nope")

    # expected error message for invalid inner parameter
    error_msg = re.escape(
        "Invalid parameter 'invalid_param' for estimator LinearRegression(). Valid"
        " parameters are: ['copy_X', 'fit_intercept', 'n_jobs', 'positive', 'tol']."
    )
    with pytest.raises(ValueError, match=error_msg):
        pipe.set_params(cls__invalid_param="nope")


def test_pipeline_methods_pca_svm():
    # Test the various methods of the pipeline (pca + svm).
    X = iris.data
    y = iris.target
    # Test with PCA + SVC
    clf = SVC(probability=True, random_state=0)
    pca = PCA(svd_solver="full", n_components="mle", whiten=True)
    pipe = Pipeline([("pca", pca), ("svc", clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)


def test_pipeline_score_samples_pca_lof():
    X = iris.data
    # Test that the score_samples method is implemented on a pipeline.
    # Test that the score_samples method on pipeline yields same results as
    # applying transform and score_samples steps separately.
    pca = PCA(svd_solver="full", n_components="mle", whiten=True)
    lof = LocalOutlierFactor(novelty=True)
    pipe = Pipeline([("pca", pca), ("lof", lof)])
    pipe.fit(X)
    # Check the shapes
    assert pipe.score_samples(X).shape == (X.shape[0],)
    # Check the values
    lof.fit(pca.fit_transform(X))
    assert_allclose(pipe.score_samples(X), lof.score_samples(pca.transform(X)))


def test_score_samples_on_pipeline_without_score_samples():
    X = np.array([[1], [2]])
    y = np.array([1, 2])
    # Test that a pipeline does not have score_samples method when the final
    # step of the pipeline does not have score_samples defined.
    pipe = make_pipeline(LogisticRegression())
    pipe.fit(X, y)

    inner_msg = "'LogisticRegression' object has no attribute 'score_samples'"
    outer_msg = "'Pipeline' has no attribute 'score_samples'"
    with pytest.raises(AttributeError, match=outer_msg) as exec_info:
        pipe.score_samples(X)
    assert isinstance(exec_info.value.__cause__, AttributeError)
    assert inner_msg in str(exec_info.value.__cause__)


def test_pipeline_methods_preprocessing_svm():
    # Test the various methods of the pipeline (preprocessing + svm).
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = StandardScaler()
    pca = PCA(n_components=2, svd_solver="randomized", whiten=True)
    clf = SVC(probability=True, random_state=0, decision_function_shape="ovr")

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([("preprocess", preprocessing), ("svc", clf)])
        pipe.fit(X, y)

        # check shapes of various prediction functions
        predict = pipe.predict(X)
        assert predict.shape == (n_samples,)

        proba = pipe.predict_proba(X)
        assert proba.shape == (n_samples, n_classes)

        log_proba = pipe.predict_log_proba(X)
        assert log_proba.shape == (n_samples, n_classes)

        decision_function = pipe.decision_function(X)
        assert decision_function.shape == (n_samples, n_classes)

        pipe.score(X, y)


def test_fit_predict_on_pipeline():
    # test that the fit_predict method is implemented on a pipeline
    # test that the fit_predict on pipeline yields same results as applying
    # transform and clustering steps separately
    scaler = StandardScaler()
    km = KMeans(random_state=0, n_init="auto")
    # As pipeline doesn't clone estimators on construction,
    # it must have its own estimators
    scaler_for_pipeline = StandardScaler()
    km_for_pipeline = KMeans(random_state=0, n_init="auto")

    # first compute the transform and clustering step separately
    scaled = scaler.fit_transform(iris.data)
    separate_pred = km.fit_predict(scaled)

    # use a pipeline to do the transform and clustering in one step
    pipe = Pipeline([("scaler", scaler_for_pipeline), ("Kmeans", km_for_pipeline)])
    pipeline_pred = pipe.fit_predict(iris.data)

    assert_array_almost_equal(pipeline_pred, separate_pred)


def test_fit_predict_on_pipeline_without_fit_predict():
    # tests that a pipeline does not have fit_predict method when final
    # step of pipeline does not have fit_predict defined
    scaler = StandardScaler()
    pca = PCA(svd_solver="full")
    pipe = Pipeline([("scaler", scaler), ("pca", pca)])

    outer_msg = "'Pipeline' has no attribute 'fit_predict'"
    inner_msg = "'PCA' object has no attribute 'fit_predict'"
    with pytest.raises(AttributeError, match=outer_msg) as exec_info:
        getattr(pipe, "fit_predict")
    assert isinstance(exec_info.value.__cause__, AttributeError)
    assert inner_msg in str(exec_info.value.__cause__)


def test_fit_predict_with_intermediate_fit_params():
    # tests that Pipeline passes fit_params to intermediate steps
    # when fit_predict is invoked
    pipe = Pipeline([("transf", TransfFitParams()), ("clf", FitParamT())])
    pipe.fit_predict(
        X=None, y=None, transf__should_get_this=True, clf__should_succeed=True
    )
    assert pipe.named_steps["transf"].fit_params["should_get_this"]
    assert pipe.named_steps["clf"].successful
    assert "should_succeed" not in pipe.named_steps["transf"].fit_params


@pytest.mark.parametrize(
    "method_name", ["predict", "predict_proba", "predict_log_proba"]
)
def test_predict_methods_with_predict_params(method_name):
    # tests that Pipeline passes predict_* to the final estimator
    # when predict_* is invoked
    pipe = Pipeline([("transf", Transf()), ("clf", DummyEstimatorParams())])
    pipe.fit(None, None)
    method = getattr(pipe, method_name)
    method(X=None, got_attribute=True)

    assert pipe.named_steps["clf"].got_attribute
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_feature_union(csr_container):
# basic sanity check for feature union
X = iris.data.copy()
X -= X.mean(axis=0)
y = iris.target
svd = TruncatedSVD(n_components=2, random_state=0)
select = SelectKBest(k=1)
fs = FeatureUnion([("svd", svd), ("select", select)])
fs.fit(X, y)
X_transformed = fs.transform(X)
assert X_transformed.shape == (X.shape[0], 3)
# check if it does the expected thing
assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
assert_array_equal(X_transformed[:, -1], select.fit_transform(X, y).ravel())
# test if it also works for sparse input
# We use a different svd object to control the random_state stream
fs = FeatureUnion([("svd", svd), ("select", select)])
X_sp = csr_container(X)
X_sp_transformed = fs.fit_transform(X_sp, y)
assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())
# Test clone
fs2 = clone(fs)
assert fs.transformer_list[0][1] is not fs2.transformer_list[0][1]
# test setting parameters
fs.set_params(select__k=2)
assert fs.fit_transform(X, y).shape == (X.shape[0], 4)
# test it works with transformers missing fit_transform
fs = FeatureUnion([("mock", Transf()), ("svd", svd), ("select", select)])
X_transformed = fs.fit_transform(X, y)
assert X_transformed.shape == (X.shape[0], 8)
# test error if some elements do not support transform
msg = "All estimators should implement fit and transform.*\\bNoTrans\\b"
fs = FeatureUnion([("transform", Transf()), ("no_transform", NoTrans())])
with pytest.raises(TypeError, match=msg):
fs.fit(X)
# test that init accepts tuples
fs = FeatureUnion((("svd", svd), ("select", select)))
fs.fit(X, y)
def test_feature_union_named_transformers():
"""Check the behaviour of `named_transformers` attribute."""
transf = Transf()
noinvtransf = NoInvTransf()
fs = FeatureUnion([("transf", transf), ("noinvtransf", noinvtransf)])
assert fs.named_transformers["transf"] == transf
assert fs.named_transformers["noinvtransf"] == noinvtransf
# test named attribute
assert fs.named_transformers.transf == transf
assert fs.named_transformers.noinvtransf == noinvtransf
def test_make_union():
pca = PCA(svd_solver="full")
mock = Transf()
fu = make_union(pca, mock)
names, transformers = zip(*fu.transformer_list)
assert names == ("pca", "transf")
assert transformers == (pca, mock)
def test_make_union_kwargs():
pca = PCA(svd_solver="full")
mock = Transf()
fu = make_union(pca, mock, n_jobs=3)
assert fu.transformer_list == make_union(pca, mock).transformer_list
assert 3 == fu.n_jobs
# invalid keyword parameters should raise an error message
msg = re.escape(
"make_union() got an unexpected keyword argument 'transformer_weights'"
)
with pytest.raises(TypeError, match=msg):
make_union(pca, mock, transformer_weights={"pca": 10, "Transf": 1})
def create_mock_transformer(base_name, n_features=3):
"""Helper to create a mock transformer with custom feature names."""
mock = Transf()
mock.get_feature_names_out = lambda input_features: [
f"{base_name}{i}" for i in range(n_features)
]
return mock


def test_make_union_passes_verbose_feature_names_out():
    # Test that make_union passes verbose_feature_names_out
    # to the FeatureUnion.
    X = iris.data
    y = iris.target

    pca = PCA()
    mock = create_mock_transformer("transf")

    union = make_union(pca, mock, verbose_feature_names_out=False)
    assert not union.verbose_feature_names_out

    fu_union = make_union(pca, mock, verbose_feature_names_out=True)
    fu_union.fit(X, y)
    assert_array_equal(
        [
            "pca__pca0",
            "pca__pca1",
            "pca__pca2",
            "pca__pca3",
            "transf__transf0",
            "transf__transf1",
            "transf__transf2",
        ],
        fu_union.get_feature_names_out(),
    )


def test_pipeline_transform():
    # Test whether pipeline works with a transformer at the end.
    # Also test pipeline.transform and pipeline.inverse_transform
    X = iris.data
    pca = PCA(n_components=2, svd_solver="full")
    pipeline = Pipeline([("pca", pca)])

    # test transform and fit_transform:
    X_trans = pipeline.fit(X).transform(X)
    X_trans2 = pipeline.fit_transform(X)
    X_trans3 = pca.fit_transform(X)
    assert_array_almost_equal(X_trans, X_trans2)
    assert_array_almost_equal(X_trans, X_trans3)

    X_back = pipeline.inverse_transform(X_trans)
    X_back2 = pca.inverse_transform(X_trans)
    assert_array_almost_equal(X_back, X_back2)


def test_pipeline_fit_transform():
    # Test whether pipeline works with a transformer missing fit_transform
    X = iris.data
    y = iris.target
    transf = Transf()
    pipeline = Pipeline([("mock", transf)])

    # test fit_transform:
    X_trans = pipeline.fit_transform(X, y)
    X_trans2 = transf.fit(X, y).transform(X)
    assert_array_almost_equal(X_trans, X_trans2)


@pytest.mark.parametrize(
    "start, end", [(0, 1), (0, 2), (1, 2), (1, 3), (None, 1), (1, None), (None, None)]
)
def test_pipeline_slice(start, end):
    pipe = Pipeline(
        [("transf1", Transf()), ("transf2", Transf()), ("clf", FitParamT())],
        memory="123",
        verbose=True,
    )
    pipe_slice = pipe[start:end]
    # Test class
    assert isinstance(pipe_slice, Pipeline)
    # Test steps
    assert pipe_slice.steps == pipe.steps[start:end]
    # Test named_steps attribute
    assert (
        list(pipe_slice.named_steps.items())
        == list(pipe.named_steps.items())[start:end]
    )
    # Test the rest of the parameters
    pipe_params = pipe.get_params(deep=False)
    pipe_slice_params = pipe_slice.get_params(deep=False)
    del pipe_params["steps"]
    del pipe_slice_params["steps"]
    assert pipe_params == pipe_slice_params
    # Test exception
    msg = "Pipeline slicing only supports a step of 1"
    with pytest.raises(ValueError, match=msg):
        pipe[start:end:-1]


def test_pipeline_index():
    transf = Transf()
    clf = FitParamT()
    pipe = Pipeline([("transf", transf), ("clf", clf)])
    assert pipe[0] == transf
    assert pipe["transf"] == transf
    assert pipe[-1] == clf
    assert pipe["clf"] == clf

    # should raise an error if slicing out of range
    with pytest.raises(IndexError):
        pipe[3]

    # should raise an error if indexing with wrong element name
    with pytest.raises(KeyError):
        pipe["foobar"]


def test_set_pipeline_steps():
    transf1 = Transf()
    transf2 = Transf()
    pipeline = Pipeline([("mock", transf1)])
    assert pipeline.named_steps["mock"] is transf1

    # Directly setting attr
    pipeline.steps = [("mock2", transf2)]
    assert "mock" not in pipeline.named_steps
    assert pipeline.named_steps["mock2"] is transf2
    assert [("mock2", transf2)] == pipeline.steps

    # Using set_params
    pipeline.set_params(steps=[("mock", transf1)])
    assert [("mock", transf1)] == pipeline.steps

    # Using set_params to replace single step
    pipeline.set_params(mock=transf2)
    assert [("mock", transf2)] == pipeline.steps

    # With invalid data
    pipeline.set_params(steps=[("junk", ())])
    msg = re.escape(
        "Last step of Pipeline should implement fit or be the string 'passthrough'."
    )
    with pytest.raises(TypeError, match=msg):
        pipeline.fit([[1]], [1])

    msg = "This 'Pipeline' has no attribute 'fit_transform'"
    with pytest.raises(AttributeError, match=msg):
        pipeline.fit_transform([[1]], [1])


def test_pipeline_named_steps():
    transf = Transf()
    mult2 = Mult(mult=2)
    pipeline = Pipeline([("mock", transf), ("mult", mult2)])

    # Test access via named_steps bunch object
    assert "mock" in pipeline.named_steps
    assert "mock2" not in pipeline.named_steps
    assert pipeline.named_steps.mock is transf
    assert pipeline.named_steps.mult is mult2

    # Test bunch with conflict attribute of dict
    pipeline = Pipeline([("values", transf), ("mult", mult2)])
    assert pipeline.named_steps.values is not transf
    assert pipeline.named_steps.mult is mult2


@pytest.mark.parametrize("passthrough", [None, "passthrough"])
def test_pipeline_correctly_adjusts_steps(passthrough):
    X = np.array([[1]])
    y = np.array([1])
    mult2 = Mult(mult=2)
    mult3 = Mult(mult=3)
    mult5 = Mult(mult=5)
    pipeline = Pipeline(
        [("m2", mult2), ("bad", passthrough), ("m3", mult3), ("m5", mult5)]
    )
    pipeline.fit(X, y)
    expected_names = ["m2", "bad", "m3", "m5"]
    actual_names = [name for name, _ in pipeline.steps]
    assert expected_names == actual_names


@pytest.mark.parametrize("passthrough", [None, "passthrough"])
def test_set_pipeline_step_passthrough(passthrough):
    X = np.array([[1]])
    y = np.array([1])
    mult2 = Mult(mult=2)
    mult3 = Mult(mult=3)
    mult5 = Mult(mult=5)

    def make():
        return Pipeline([("m2", mult2), ("m3", mult3), ("last", mult5)])

    pipeline = make()

    exp = 2 * 3 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline.set_params(m3=passthrough)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    assert pipeline.get_params(deep=True) == {
        "steps": pipeline.steps,
        "m2": mult2,
        "m3": passthrough,
        "last": mult5,
        "memory": None,
        "m2__mult": 2,
        "last__mult": 5,
        "transform_input": None,
        "verbose": False,
    }

    pipeline.set_params(m2=passthrough)
    exp = 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    # for other methods, ensure no AttributeErrors on None:
    other_methods = [
        "predict_proba",
        "predict_log_proba",
        "decision_function",
        "transform",
        "score",
    ]
    for method in other_methods:
        getattr(pipeline, method)(X)

    pipeline.set_params(m2=mult2)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline = make()
    pipeline.set_params(last=passthrough)
    # mult2 and mult3 are active
    exp = 6
    assert_array_equal([[exp]], pipeline.fit(X, y).transform(X))
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    inner_msg = "'str' object has no attribute 'predict'"
    outer_msg = "This 'Pipeline' has no attribute 'predict'"
    with pytest.raises(AttributeError, match=outer_msg) as exec_info:
        getattr(pipeline, "predict")
    assert isinstance(exec_info.value.__cause__, AttributeError)
    assert inner_msg in str(exec_info.value.__cause__)

    # Check 'passthrough' step at construction time
    exp = 2 * 5
    pipeline = Pipeline([("m2", mult2), ("m3", passthrough), ("last", mult5)])
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))


def test_pipeline_ducktyping():
    pipeline = make_pipeline(Mult(5))
    pipeline.predict
    pipeline.transform
    pipeline.inverse_transform

    pipeline = make_pipeline(Transf())
    assert not hasattr(pipeline, "predict")
    pipeline.transform
    pipeline.inverse_transform

    pipeline = make_pipeline("passthrough")
    assert pipeline.steps[0] == ("passthrough", "passthrough")
    assert not hasattr(pipeline, "predict")
    pipeline.transform
    pipeline.inverse_transform

    pipeline = make_pipeline(Transf(), NoInvTransf())
    assert not hasattr(pipeline, "predict")
    pipeline.transform
    assert not hasattr(pipeline, "inverse_transform")

    pipeline = make_pipeline(NoInvTransf(), Transf())
    assert not hasattr(pipeline, "predict")
    pipeline.transform
    assert not hasattr(pipeline, "inverse_transform")


def test_make_pipeline():
    t1 = Transf()
    t2 = Transf()
    pipe = make_pipeline(t1, t2)
    assert isinstance(pipe, Pipeline)
    assert pipe.steps[0][0] == "transf-1"
    assert pipe.steps[1][0] == "transf-2"

    pipe = make_pipeline(t1, t2, FitParamT())
    assert isinstance(pipe, Pipeline)
    assert pipe.steps[0][0] == "transf-1"
    assert pipe.steps[1][0] == "transf-2"
    assert pipe.steps[2][0] == "fitparamt"


@pytest.mark.parametrize(
    "pipeline, check_estimator_type",
    [
        (make_pipeline(StandardScaler(), LogisticRegression()), is_classifier),
        (make_pipeline(StandardScaler(), LinearRegression()), is_regressor),
        (
            make_pipeline(StandardScaler()),
            lambda est: get_tags(est).estimator_type is None,
        ),
        (Pipeline([]), lambda est: est._estimator_type is None),
    ],
)
def test_pipeline_estimator_type(pipeline, check_estimator_type):
    """Check that the estimator type returned by the pipeline is correct.

    Non-regression test as part of:
    https://github.com/scikit-learn/scikit-learn/issues/30197
    """
    # Smoke test the repr
    repr(pipeline)
    assert check_estimator_type(pipeline)


def test_sklearn_tags_with_empty_pipeline():
    """Check that we propagate properly the tags in a Pipeline.

    Non-regression test as part of:
    https://github.com/scikit-learn/scikit-learn/issues/30197
    """
    empty_pipeline = Pipeline(steps=[])
    be = BaseEstimator()
    expected_tags = be.__sklearn_tags__()
    assert empty_pipeline.__sklearn_tags__() == expected_tags


def test_feature_union_weights():
    # test feature union with transformer weights
    X = iris.data
    y = iris.target
    pca = PCA(n_components=2, svd_solver="randomized", random_state=0)
    select = SelectKBest(k=1)
    # test using fit followed by transform
    fs = FeatureUnion(
        [("pca", pca), ("select", select)], transformer_weights={"pca": 10}
    )
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # test using fit_transform
    fs = FeatureUnion(
        [("pca", pca), ("select", select)], transformer_weights={"pca": 10}
    )
    X_fit_transformed = fs.fit_transform(X, y)
    # test it works with transformers missing fit_transform
    fs = FeatureUnion(
        [("mock", Transf()), ("pca", pca), ("select", select)],
        transformer_weights={"mock": 10},
    )
    X_fit_transformed_wo_method = fs.fit_transform(X, y)
    # check against expected result
    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1], select.fit_transform(X, y).ravel())
    assert_array_almost_equal(X_fit_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_fit_transformed[:, -1], select.fit_transform(X, y).ravel())
    assert X_fit_transformed_wo_method.shape == (X.shape[0], 7)


def test_feature_union_parallel():
    # test that n_jobs works for FeatureUnion
    X = JUNK_FOOD_DOCS
    fs = FeatureUnion(
        [
            ("words", CountVectorizer(analyzer="word")),
            ("chars", CountVectorizer(analyzer="char")),
        ]
    )

    fs_parallel = FeatureUnion(
        [
            ("words", CountVectorizer(analyzer="word")),
            ("chars", CountVectorizer(analyzer="char")),
        ],
        n_jobs=2,
    )

    fs_parallel2 = FeatureUnion(
        [
            ("words", CountVectorizer(analyzer="word")),
            ("chars", CountVectorizer(analyzer="char")),
        ],
        n_jobs=2,
    )

    fs.fit(X)
    X_transformed = fs.transform(X)
    assert X_transformed.shape[0] == len(X)

    fs_parallel.fit(X)
    X_transformed_parallel = fs_parallel.transform(X)
    assert X_transformed.shape == X_transformed_parallel.shape
    assert_array_equal(X_transformed.toarray(), X_transformed_parallel.toarray())

    # fit_transform should behave the same
    X_transformed_parallel2 = fs_parallel2.fit_transform(X)
    assert_array_equal(X_transformed.toarray(), X_transformed_parallel2.toarray())

    # transformers should stay fit after fit_transform
    X_transformed_parallel2 = fs_parallel2.transform(X)
    assert_array_equal(X_transformed.toarray(), X_transformed_parallel2.toarray())


def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names_out()
    for feat in feature_names:
        assert "chars__" in feat or "words__" in feat
    assert len(feature_names) == 35

    ft = FeatureUnion([("tr1", Transf())]).fit([[1]])

    msg = re.escape(
        "Transformer tr1 (type Transf) does not provide get_feature_names_out"
    )
    with pytest.raises(AttributeError, match=msg):
        ft.get_feature_names_out()


def test_classes_property():
    X = iris.data
    y = iris.target

    reg = make_pipeline(SelectKBest(k=1), LinearRegression())
    reg.fit(X, y)
    with pytest.raises(AttributeError):
        getattr(reg, "classes_")

    clf = make_pipeline(SelectKBest(k=1), LogisticRegression(random_state=0))
    with pytest.raises(AttributeError):
        getattr(clf, "classes_")
    clf.fit(X, y)
    assert_array_equal(clf.classes_, np.unique(y))


def test_set_feature_union_steps():
    mult2 = Mult(2)
    mult3 = Mult(3)
    mult5 = Mult(5)

    mult3.get_feature_names_out = lambda input_features: ["x3"]
    mult2.get_feature_names_out = lambda input_features: ["x2"]
    mult5.get_feature_names_out = lambda input_features: ["x5"]

    ft = FeatureUnion([("m2", mult2), ("m3", mult3)])
    assert_array_equal([[2, 3]], ft.transform(np.asarray([[1]])))
    assert_array_equal(["m2__x2", "m3__x3"], ft.get_feature_names_out())

    # Directly setting attr
    ft.transformer_list = [("m5", mult5)]
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_array_equal(["m5__x5"], ft.get_feature_names_out())

    # Using set_params
    ft.set_params(transformer_list=[("mock", mult3)])
    assert_array_equal([[3]], ft.transform(np.asarray([[1]])))
    assert_array_equal(["mock__x3"], ft.get_feature_names_out())

    # Using set_params to replace single step
    ft.set_params(mock=mult5)
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_array_equal(["mock__x5"], ft.get_feature_names_out())


def test_set_feature_union_step_drop():
    mult2 = Mult(2)
    mult3 = Mult(3)
    mult2.get_feature_names_out = lambda input_features: ["x2"]
    mult3.get_feature_names_out = lambda input_features: ["x3"]

    X = np.asarray([[1]])

    ft = FeatureUnion([("m2", mult2), ("m3", mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert_array_equal(["m2__x2", "m3__x3"], ft.get_feature_names_out())

    ft.set_params(m2="drop")
    assert_array_equal([[3]], ft.fit(X).transform(X))
    assert_array_equal([[3]], ft.fit_transform(X))
    assert_array_equal(["m3__x3"], ft.get_feature_names_out())

    ft.set_params(m3="drop")
    assert_array_equal([[]], ft.fit(X).transform(X))
    assert_array_equal([[]], ft.fit_transform(X))
    assert_array_equal([], ft.get_feature_names_out())

    # check we can change back
    ft.set_params(m3=mult3)
    assert_array_equal([[3]], ft.fit(X).transform(X))

    # Check 'drop' step at construction time
    ft = FeatureUnion([("m2", "drop"), ("m3", mult3)])
    assert_array_equal([[3]], ft.fit(X).transform(X))
    assert_array_equal([[3]], ft.fit_transform(X))
    assert_array_equal(["m3__x3"], ft.get_feature_names_out())


def test_set_feature_union_passthrough():
    """Check the behaviour of setting a transformer to `"passthrough"`."""
    mult2 = Mult(2)
    mult3 = Mult(3)

    # We only test get_feature_names_out, as get_feature_names is unsupported by
    # FunctionTransformer, and hence unsupported by FeatureUnion passthrough.
    mult2.get_feature_names_out = lambda input_features: ["x2"]
    mult3.get_feature_names_out = lambda input_features: ["x3"]

    X = np.asarray([[1]])

    ft = FeatureUnion([("m2", mult2), ("m3", mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert_array_equal(["m2__x2", "m3__x3"], ft.get_feature_names_out())

    ft.set_params(m2="passthrough")
    assert_array_equal([[1, 3]], ft.fit(X).transform(X))
    assert_array_equal([[1, 3]], ft.fit_transform(X))
    assert_array_equal(["m2__myfeat", "m3__x3"], ft.get_feature_names_out(["myfeat"]))

    ft.set_params(m3="passthrough")
    assert_array_equal([[1, 1]], ft.fit(X).transform(X))
    assert_array_equal([[1, 1]], ft.fit_transform(X))
    assert_array_equal(
        ["m2__myfeat", "m3__myfeat"], ft.get_feature_names_out(["myfeat"])
    )

    # check we can change back
    ft.set_params(m3=mult3)
    assert_array_equal([[1, 3]], ft.fit(X).transform(X))
    assert_array_equal([[1, 3]], ft.fit_transform(X))
    assert_array_equal(["m2__myfeat", "m3__x3"], ft.get_feature_names_out(["myfeat"]))

    # Check 'passthrough' step at construction time
    ft = FeatureUnion([("m2", "passthrough"), ("m3", mult3)])
    assert_array_equal([[1, 3]], ft.fit(X).transform(X))
    assert_array_equal([[1, 3]], ft.fit_transform(X))
    assert_array_equal(["m2__myfeat", "m3__x3"], ft.get_feature_names_out(["myfeat"]))

    X = iris.data
    columns = X.shape[1]
    pca = PCA(n_components=2, svd_solver="randomized", random_state=0)

    ft = FeatureUnion([("passthrough", "passthrough"), ("pca", pca)])
    assert_array_equal(X, ft.fit(X).transform(X)[:, :columns])
    assert_array_equal(X, ft.fit_transform(X)[:, :columns])
    assert_array_equal(
        [
            "passthrough__f0",
            "passthrough__f1",
            "passthrough__f2",
            "passthrough__f3",
            "pca__pca0",
            "pca__pca1",
        ],
        ft.get_feature_names_out(["f0", "f1", "f2", "f3"]),
    )

    ft.set_params(pca="passthrough")
    X_ft = ft.fit(X).transform(X)
    assert_array_equal(X_ft, np.hstack([X, X]))
    X_ft = ft.fit_transform(X)
    assert_array_equal(X_ft, np.hstack([X, X]))
    assert_array_equal(
        [
            "passthrough__f0",
            "passthrough__f1",
            "passthrough__f2",
            "passthrough__f3",
            "pca__f0",
            "pca__f1",
            "pca__f2",
            "pca__f3",
        ],
        ft.get_feature_names_out(["f0", "f1", "f2", "f3"]),
    )

    ft.set_params(passthrough=pca)
    assert_array_equal(X, ft.fit(X).transform(X)[:, -columns:])
    assert_array_equal(X, ft.fit_transform(X)[:, -columns:])
    assert_array_equal(
        [
            "passthrough__pca0",
            "passthrough__pca1",
            "pca__f0",
            "pca__f1",
            "pca__f2",
            "pca__f3",
        ],
        ft.get_feature_names_out(["f0", "f1", "f2", "f3"]),
    )

    ft = FeatureUnion(
        [("passthrough", "passthrough"), ("pca", pca)],
        transformer_weights={"passthrough": 2},
    )
    assert_array_equal(X * 2, ft.fit(X).transform(X)[:, :columns])
    assert_array_equal(X * 2, ft.fit_transform(X)[:, :columns])
    assert_array_equal(
        [
            "passthrough__f0",
            "passthrough__f1",
            "passthrough__f2",
            "passthrough__f3",
            "pca__pca0",
            "pca__pca1",
        ],
        ft.get_feature_names_out(["f0", "f1", "f2", "f3"]),
    )


def test_feature_union_passthrough_get_feature_names_out_true():
    """Check feature_names_out for verbose_feature_names_out=True (default)"""
    X = iris.data
    pca = PCA(n_components=2, svd_solver="randomized", random_state=0)

    ft = FeatureUnion([("pca", pca), ("passthrough", "passthrough")])
    ft.fit(X)
    assert_array_equal(
        [
            "pca__pca0",
            "pca__pca1",
            "passthrough__x0",
            "passthrough__x1",
            "passthrough__x2",
            "passthrough__x3",
        ],
        ft.get_feature_names_out(),
    )


def test_feature_union_passthrough_get_feature_names_out_false():
    """Check feature_names_out for verbose_feature_names_out=False"""
    X = iris.data
    pca = PCA(n_components=2, svd_solver="randomized", random_state=0)

    ft = FeatureUnion(
        [("pca", pca), ("passthrough", "passthrough")], verbose_feature_names_out=False
    )
    ft.fit(X)
    assert_array_equal(
        [
            "pca0",
            "pca1",
            "x0",
            "x1",
            "x2",
            "x3",
        ],
        ft.get_feature_names_out(),
    )
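

# Note: verbose_feature_names_out=False drops the "<transformer name>__" prefix
# from output feature names, at the cost of possible name collisions, which the
# next two tests show raise a ValueError.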


def test_feature_union_passthrough_get_feature_names_out_false_errors():
    """Check get_feature_names_out and non-verbose names and colliding names."""
    pd = pytest.importorskip("pandas")
    X = pd.DataFrame([[1, 2], [2, 3]], columns=["a", "b"])

    select_a = FunctionTransformer(
        lambda X: X[["a"]], feature_names_out=lambda self, _: np.asarray(["a"])
    )
    union = FeatureUnion(
        [("t1", StandardScaler()), ("t2", select_a)],
        verbose_feature_names_out=False,
    )
    union.fit(X)

    msg = re.escape(
        "Output feature names: ['a'] are not unique. "
        "Please set verbose_feature_names_out=True to add prefixes to feature names"
    )
    with pytest.raises(ValueError, match=msg):
        union.get_feature_names_out()


def test_feature_union_passthrough_get_feature_names_out_false_errors_overlap_over_5():
    """Check get_feature_names_out with non-verbose names and >= 5 colliding names."""
    pd = pytest.importorskip("pandas")
    X = pd.DataFrame([list(range(10))], columns=[f"f{i}" for i in range(10)])

    union = FeatureUnion(
        [("t1", "passthrough"), ("t2", "passthrough")],
        verbose_feature_names_out=False,
    )
    union.fit(X)

    msg = re.escape(
        "Output feature names: ['f0', 'f1', 'f2', 'f3', 'f4', ...] "
        "are not unique. Please set verbose_feature_names_out=True to add prefixes to"
        " feature names"
    )
    with pytest.raises(ValueError, match=msg):
        union.get_feature_names_out()


def test_step_name_validation():
    error_message_1 = r"Estimator names must not contain __: got \['a__q'\]"
    error_message_2 = r"Names provided are not unique: \['a', 'a'\]"
    error_message_3 = r"Estimator names conflict with constructor arguments: \['%s'\]"
    bad_steps1 = [("a__q", Mult(2)), ("b", Mult(3))]
    bad_steps2 = [("a", Mult(2)), ("a", Mult(3))]
    for cls, param in [(Pipeline, "steps"), (FeatureUnion, "transformer_list")]:
        # we validate in construction (despite scikit-learn convention)
        bad_steps3 = [("a", Mult(2)), (param, Mult(3))]
        for bad_steps, message in [
            (bad_steps1, error_message_1),
            (bad_steps2, error_message_2),
            (bad_steps3, error_message_3 % param),
        ]:
            # three ways to make invalid:
            # - construction
            with pytest.raises(ValueError, match=message):
                cls(**{param: bad_steps}).fit([[1]], [1])

            # - setattr
            est = cls(**{param: [("a", Mult(1))]})
            setattr(est, param, bad_steps)
            with pytest.raises(ValueError, match=message):
                est.fit([[1]], [1])

            with pytest.raises(ValueError, match=message):
                est.fit_transform([[1]], [1])

            # - set_params
            est = cls(**{param: [("a", Mult(1))]})
            est.set_params(**{param: bad_steps})
            with pytest.raises(ValueError, match=message):
                est.fit([[1]], [1])

            with pytest.raises(ValueError, match=message):
                est.fit_transform([[1]], [1])


def test_set_params_nested_pipeline():
    estimator = Pipeline([("a", Pipeline([("b", DummyRegressor())]))])
    estimator.set_params(a__b__alpha=0.001, a__b=Lasso())
    estimator.set_params(a__steps=[("b", LogisticRegression())], a__b__C=5)
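

# Note: the single set_params calls above can both replace a step and set a
# parameter on the replacement because set_params applies step replacements
# before the more deeply nested '__' parameters.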


def test_pipeline_memory():
    X = iris.data
    y = iris.target
    cachedir = mkdtemp()
    try:
        memory = joblib.Memory(location=cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([("transf", clone(transf)), ("svc", clf)])
        cached_pipe = Pipeline([("transf", transf), ("svc", clf)], memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        ts = cached_pipe.named_steps["transf"].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_
        )
        assert not hasattr(transf, "means_")
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_
        )
        assert ts == cached_pipe.named_steps["transf"].timestamp_
        # Create a new pipeline with cloned estimators
        # Check that even changing the step name does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline(
            [("transf_2", transf_2), ("svc", clf_2)], memory=memory
        )
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe_2.predict_proba(X))
        assert_array_equal(
            pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X)
        )
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe_2.named_steps["transf_2"].means_,
        )
        assert ts == cached_pipe_2.named_steps["transf_2"].timestamp_
    finally:
        shutil.rmtree(cachedir)
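

# Note: Memory caches a fitted transformer keyed on the transformer's
# parameters and the training data, not on the step name, which is why
# cached_pipe_2 above reuses the cache entry despite the renamed step.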


def test_make_pipeline_memory():
    cachedir = mkdtemp()
    memory = joblib.Memory(location=cachedir, verbose=10)
    pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory)
    assert pipeline.memory is memory
    pipeline = make_pipeline(DummyTransf(), SVC())
    assert pipeline.memory is None
    assert len(pipeline) == 2

    shutil.rmtree(cachedir)


class FeatureNameSaver(BaseEstimator):
    def fit(self, X, y=None):
        _check_feature_names(self, X, reset=True)
        return self

    def transform(self, X, y=None):
        return X

    def get_feature_names_out(self, input_features=None):
        return input_features


def test_features_names_passthrough():
    """Check pipeline.get_feature_names_out with passthrough"""
    pipe = Pipeline(
        steps=[
            ("names", FeatureNameSaver()),
            ("pass", "passthrough"),
            ("clf", LogisticRegression()),
        ]
    )
    iris = load_iris()
    pipe.fit(iris.data, iris.target)
    assert_array_equal(
        pipe[:-1].get_feature_names_out(iris.feature_names), iris.feature_names
    )


def test_feature_names_count_vectorizer():
    """Check pipeline.get_feature_names_out with vectorizers"""
    pipe = Pipeline(steps=[("vect", CountVectorizer()), ("clf", LogisticRegression())])
    y = ["pizza" in x for x in JUNK_FOOD_DOCS]
    pipe.fit(JUNK_FOOD_DOCS, y)
    assert_array_equal(
        pipe[:-1].get_feature_names_out(),
        ["beer", "burger", "coke", "copyright", "pizza", "the"],
    )
    assert_array_equal(
        pipe[:-1].get_feature_names_out("nonsense_is_ignored"),
        ["beer", "burger", "coke", "copyright", "pizza", "the"],
    )


def test_pipeline_feature_names_out_error_without_definition():
    """Check that an error is raised when a transformer does not define
    `get_feature_names_out`."""
    pipe = Pipeline(steps=[("notrans", NoTrans())])
    iris = load_iris()
    pipe.fit(iris.data, iris.target)

    msg = "does not provide get_feature_names_out"
    with pytest.raises(AttributeError, match=msg):
        pipe.get_feature_names_out()


def test_pipeline_param_error():
    clf = make_pipeline(LogisticRegression())
    with pytest.raises(
        ValueError, match="Pipeline.fit does not accept the sample_weight parameter"
    ):
        clf.fit([[0], [0]], [0, 1], sample_weight=[1, 1])


parameter_grid_test_verbose = (
    (est, pattern, method)
    for (est, pattern), method in itertools.product(
        [
            (
                Pipeline([("transf", Transf()), ("clf", FitParamT())]),
                r"\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n"
                r"\[Pipeline\].*\(step 2 of 2\) Processing clf.* total=.*\n$",
            ),
            (
                Pipeline([("transf", Transf()), ("noop", None), ("clf", FitParamT())]),
                r"\[Pipeline\].*\(step 1 of 3\) Processing transf.* total=.*\n"
                r"\[Pipeline\].*\(step 2 of 3\) Processing noop.* total=.*\n"
                r"\[Pipeline\].*\(step 3 of 3\) Processing clf.* total=.*\n$",
            ),
            (
                Pipeline(
                    [
                        ("transf", Transf()),
                        ("noop", "passthrough"),
                        ("clf", FitParamT()),
                    ]
                ),
                r"\[Pipeline\].*\(step 1 of 3\) Processing transf.* total=.*\n"
                r"\[Pipeline\].*\(step 2 of 3\) Processing noop.* total=.*\n"
                r"\[Pipeline\].*\(step 3 of 3\) Processing clf.* total=.*\n$",
            ),
            (
                Pipeline([("transf", Transf()), ("clf", None)]),
                r"\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n"
                r"\[Pipeline\].*\(step 2 of 2\) Processing clf.* total=.*\n$",
            ),
            (
                Pipeline([("transf", None), ("mult", Mult())]),
                r"\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n"
                r"\[Pipeline\].*\(step 2 of 2\) Processing mult.* total=.*\n$",
            ),
            (
                Pipeline([("transf", "passthrough"), ("mult", Mult())]),
                r"\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n"
                r"\[Pipeline\].*\(step 2 of 2\) Processing mult.* total=.*\n$",
            ),
            (
                FeatureUnion([("mult1", Mult()), ("mult2", Mult())]),
                r"\[FeatureUnion\].*\(step 1 of 2\) Processing mult1.* total=.*\n"
                r"\[FeatureUnion\].*\(step 2 of 2\) Processing mult2.* total=.*\n$",
            ),
            (
                FeatureUnion([("mult1", "drop"), ("mult2", Mult()), ("mult3", "drop")]),
                r"\[FeatureUnion\].*\(step 1 of 1\) Processing mult2.* total=.*\n$",
            ),
        ],
        ["fit", "fit_transform", "fit_predict"],
    )
    if hasattr(est, method)
    and not (
        method == "fit_transform"
        and hasattr(est, "steps")
        and isinstance(est.steps[-1][1], FitParamT)
    )
)
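
# The filter above keeps only (estimator, method) combinations that the
# estimator exposes, and additionally excludes fit_transform for pipelines
# ending in FitParamT, whose final step implements fit_predict but not
# transform.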
@pytest.mark.parametrize("est, pattern, method", parameter_grid_test_verbose)
def test_verbose(est, method, pattern, capsys):
func = getattr(est, method)
X = [[1, 2, 3], [4, 5, 6]]
y = [[7], [8]]
est.set_params(verbose=False)
func(X, y)
assert not capsys.readouterr().out, "Got output for verbose=False"
est.set_params(verbose=True)
func(X, y)
assert re.match(pattern, capsys.readouterr().out)


def test_n_features_in_pipeline():
    # make sure pipelines delegate n_features_in to the first step
    X = [[1, 2], [3, 4], [5, 6]]
    y = [0, 1, 2]

    ss = StandardScaler()
    gbdt = HistGradientBoostingClassifier()
    pipe = make_pipeline(ss, gbdt)
    assert not hasattr(pipe, "n_features_in_")
    pipe.fit(X, y)
    assert pipe.n_features_in_ == ss.n_features_in_ == 2

    # if the first step has the n_features_in attribute then the pipeline also
    # has it, even though it isn't fitted.
    ss = StandardScaler()
    gbdt = HistGradientBoostingClassifier()
    pipe = make_pipeline(ss, gbdt)
    ss.fit(X, y)
    assert pipe.n_features_in_ == ss.n_features_in_ == 2
    assert not hasattr(gbdt, "n_features_in_")


def test_n_features_in_feature_union():
    # make sure FeatureUnion delegates n_features_in to the first transformer
    X = [[1, 2], [3, 4], [5, 6]]
    y = [0, 1, 2]

    ss = StandardScaler()
    fu = make_union(ss)
    assert not hasattr(fu, "n_features_in_")
    fu.fit(X, y)
    assert fu.n_features_in_ == ss.n_features_in_ == 2

    # if the first step has the n_features_in attribute then the feature_union
    # also has it, even though it isn't fitted.
    ss = StandardScaler()
    fu = make_union(ss)
    ss.fit(X, y)
    assert fu.n_features_in_ == ss.n_features_in_ == 2


def test_feature_union_fit_params():
    # Regression test for issue: #15117
    class DummyTransformer(TransformerMixin, BaseEstimator):
        def fit(self, X, y=None, **fit_params):
            if fit_params != {"a": 0}:
                raise ValueError
            return self

        def transform(self, X, y=None):
            return X

    X, y = iris.data, iris.target
    t = FeatureUnion([("dummy0", DummyTransformer()), ("dummy1", DummyTransformer())])
    with pytest.raises(ValueError):
        t.fit(X, y)

    with pytest.raises(ValueError):
        t.fit_transform(X, y)

    t.fit(X, y, a=0)
    t.fit_transform(X, y, a=0)


def test_feature_union_fit_params_without_fit_transform():
    # Test that metadata is passed correctly to underlying transformers that don't
    # implement a `fit_transform` method when SLEP6 is not enabled.
    class DummyTransformer(ConsumingNoFitTransformTransformer):
        def fit(self, X, y=None, **fit_params):
            if fit_params != {"metadata": 1}:
                raise ValueError
            return self

    X, y = iris.data, iris.target
    t = FeatureUnion(
        [
            ("nofittransform0", DummyTransformer()),
            ("nofittransform1", DummyTransformer()),
        ]
    )

    with pytest.raises(ValueError):
        t.fit_transform(X, y, metadata=0)

    t.fit_transform(X, y, metadata=1)


def test_pipeline_missing_values_leniency():
    # check that pipeline leaves the missing values validation to
    # the underlying transformers and predictors.
    X, y = iris.data.copy(), iris.target.copy()
    mask = np.random.choice([1, 0], X.shape, p=[0.1, 0.9]).astype(bool)
    X[mask] = np.nan
    pipe = make_pipeline(SimpleImputer(), LogisticRegression())
    assert pipe.fit(X, y).score(X, y) > 0.4


def test_feature_union_warns_unknown_transformer_weight():
    # Warn user when transformer_weights contains a key not present in
    # transformer_list
    X = [[1, 2], [3, 4], [5, 6]]
    y = [0, 1, 2]

    transformer_list = [("transf", Transf())]
    # Transformer weights dictionary with incorrect name
    weights = {"transformer": 1}
    expected_msg = (
        'Attempting to weight transformer "transformer", '
        "but it is not present in transformer_list."
    )
    union = FeatureUnion(transformer_list, transformer_weights=weights)
    with pytest.raises(ValueError, match=expected_msg):
        union.fit(X, y)
@pytest.mark.parametrize("passthrough", [None, "passthrough"])
def test_pipeline_get_tags_none(passthrough):
# Checks that tags are set correctly when the first transformer is None or
# 'passthrough'
# Non-regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/18815
pipe = make_pipeline(passthrough, SVC())
assert not pipe.__sklearn_tags__().input_tags.pairwise
# FIXME: Replace this test with a full `check_estimator` once we have API only
# checks.
@pytest.mark.parametrize("Predictor", [MinimalRegressor, MinimalClassifier])
def test_search_cv_using_minimal_compatible_estimator(Predictor):
# Check that third-party library estimators can be part of a pipeline
# and tuned by grid-search without inheriting from BaseEstimator.
rng = np.random.RandomState(0)
X, y = rng.randn(25, 2), np.array([0] * 5 + [1] * 20)
model = Pipeline(
[("transformer", MinimalTransformer()), ("predictor", Predictor())]
)
model.fit(X, y)
y_pred = model.predict(X)
if is_classifier(model):
assert_array_equal(y_pred, 1)
assert model.score(X, y) == pytest.approx(accuracy_score(y, y_pred))
else:
assert_allclose(y_pred, y.mean())
assert model.score(X, y) == pytest.approx(r2_score(y, y_pred))
def test_pipeline_check_if_fitted():
class Estimator(BaseEstimator):
def fit(self, X, y):
self.fitted_ = True
return self
pipeline = Pipeline([("clf", Estimator())])
with pytest.raises(NotFittedError):
check_is_fitted(pipeline)
pipeline.fit(iris.data, iris.target)
check_is_fitted(pipeline)
def test_feature_union_check_if_fitted():
"""Check __sklearn_is_fitted__ is defined correctly."""
X = [[1, 2], [3, 4], [5, 6]]
y = [0, 1, 2]
union = FeatureUnion([("clf", MinimalTransformer())])
with pytest.raises(NotFittedError):
check_is_fitted(union)
union.fit(X, y)
check_is_fitted(union)
# passthrough is stateless
union = FeatureUnion([("pass", "passthrough")])
check_is_fitted(union)
union = FeatureUnion([("clf", MinimalTransformer()), ("pass", "passthrough")])
with pytest.raises(NotFittedError):
check_is_fitted(union)
union.fit(X, y)
check_is_fitted(union)


def test_pipeline_get_feature_names_out_passes_names_through():
    """Check that pipeline passes names through.

    Non-regression test for #21349.
    """
    X, y = iris.data, iris.target

    class AddPrefixStandardScalar(StandardScaler):
        def get_feature_names_out(self, input_features=None):
            names = super().get_feature_names_out(input_features=input_features)
            return np.asarray([f"my_prefix_{name}" for name in names], dtype=object)

    pipe = make_pipeline(AddPrefixStandardScalar(), StandardScaler())
    pipe.fit(X, y)

    input_names = iris.feature_names
    feature_names_out = pipe.get_feature_names_out(input_names)

    assert_array_equal(feature_names_out, [f"my_prefix_{name}" for name in input_names])


def test_pipeline_set_output_integration():
    """Test pipeline's set_output with feature names."""
    pytest.importorskip("pandas")

    X, y = load_iris(as_frame=True, return_X_y=True)

    pipe = make_pipeline(StandardScaler(), LogisticRegression())
    pipe.set_output(transform="pandas")
    pipe.fit(X, y)

    feature_names_in_ = pipe[:-1].get_feature_names_out()
    log_reg_feature_names = pipe[-1].feature_names_in_

    assert_array_equal(feature_names_in_, log_reg_feature_names)


def test_feature_union_set_output():
    """Test feature union with set_output API."""
    pd = pytest.importorskip("pandas")

    X, _ = load_iris(as_frame=True, return_X_y=True)
    X_train, X_test = train_test_split(X, random_state=0)
    union = FeatureUnion([("scalar", StandardScaler()), ("pca", PCA())])
    union.set_output(transform="pandas")
    union.fit(X_train)

    X_trans = union.transform(X_test)
    assert isinstance(X_trans, pd.DataFrame)
    assert_array_equal(X_trans.columns, union.get_feature_names_out())
    assert_array_equal(X_trans.index, X_test.index)


def test_feature_union_getitem():
    """Check FeatureUnion.__getitem__ returns expected results."""
    scalar = StandardScaler()
    pca = PCA()
    union = FeatureUnion(
        [
            ("scalar", scalar),
            ("pca", pca),
            ("pass", "passthrough"),
            ("drop_me", "drop"),
        ]
    )
    assert union["scalar"] is scalar
    assert union["pca"] is pca
    assert union["pass"] == "passthrough"
    assert union["drop_me"] == "drop"


@pytest.mark.parametrize("key", [0, slice(0, 2)])
def test_feature_union_getitem_error(key):
    """Raise error when __getitem__ gets a non-string input."""
    union = FeatureUnion([("scalar", StandardScaler()), ("pca", PCA())])

    msg = "Only string keys are supported"
    with pytest.raises(KeyError, match=msg):
        union[key]


def test_feature_union_feature_names_in_():
    """Ensure feature union has `.feature_names_in_` attribute if `X` has a
    `columns` attribute.

    Test for #24754.
    """
    pytest.importorskip("pandas")

    X, _ = load_iris(as_frame=True, return_X_y=True)
    # FeatureUnion should have the feature_names_in_ attribute if the
    # first transformer also has it
    scaler = StandardScaler()
    scaler.fit(X)
    union = FeatureUnion([("scale", scaler)])
    assert hasattr(union, "feature_names_in_")
    assert_array_equal(X.columns, union.feature_names_in_)
    assert_array_equal(scaler.feature_names_in_, union.feature_names_in_)

    # fit with pandas.DataFrame
    union = FeatureUnion([("pass", "passthrough")])
    union.fit(X)
    assert hasattr(union, "feature_names_in_")
    assert_array_equal(X.columns, union.feature_names_in_)

    # fit with numpy array
    X_array = X.to_numpy()
    union = FeatureUnion([("pass", "passthrough")])
    union.fit(X_array)
    assert not hasattr(union, "feature_names_in_")
# transform_input tests
# =====================
@config_context(enable_metadata_routing=True)
@pytest.mark.parametrize("method", ["fit", "fit_transform"])
def test_transform_input_pipeline(method):
"""Test that with transform_input, data is correctly transformed for each step."""
def get_transformer(registry, sample_weight, metadata):
"""Get a transformer with requests set."""
return (
ConsumingTransformer(registry=registry)
.set_fit_request(sample_weight=sample_weight, metadata=metadata)
.set_transform_request(sample_weight=sample_weight, metadata=metadata)
)
def get_pipeline():
"""Get a pipeline and corresponding registries.
The pipeline has 4 steps, with different request values set to test different
cases. One is aliased.
"""
registry_1, registry_2, registry_3, registry_4 = (
_Registry(),
_Registry(),
_Registry(),
_Registry(),
)
pipe = make_pipeline(
get_transformer(registry_1, sample_weight=True, metadata=True),
get_transformer(registry_2, sample_weight=False, metadata=False),
get_transformer(registry_3, sample_weight=True, metadata=True),
get_transformer(registry_4, sample_weight="other_weights", metadata=True),
transform_input=["sample_weight"],
)
return pipe, registry_1, registry_2, registry_3, registry_4
def check_metadata(registry, methods, **metadata):
"""Check that the right metadata was recorded for the given methods."""
assert registry
for estimator in registry:
for method in methods:
check_recorded_metadata(
estimator,
method=method,
parent=method,
**metadata,
)
X = np.array([[1, 2], [3, 4]])
y = np.array([0, 1])
sample_weight = np.array([[1, 2]])
other_weights = np.array([[30, 40]])
metadata = np.array([[100, 200]])
pipe, registry_1, registry_2, registry_3, registry_4 = get_pipeline()
pipe.fit(
X,
y,
sample_weight=sample_weight,
other_weights=other_weights,
metadata=metadata,
)
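    # ConsumingTransformer.transform adds 1 to its input, and
    # `transform_input=["sample_weight"]` replays each step's metadata through
    # the *previous* steps' transforms. Hence step 3 sees sample_weight + 2
    # (transformed by steps 1 and 2) and step 4 sees other_weights + 3 (the
    # aliased weights transformed by steps 1-3).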
check_metadata(
registry_1, ["fit", "transform"], sample_weight=sample_weight, metadata=metadata
)
check_metadata(registry_2, ["fit", "transform"])
check_metadata(
registry_3,
["fit", "transform"],
sample_weight=sample_weight + 2,
metadata=metadata,
)
check_metadata(
registry_4,
method.split("_"), # ["fit", "transform"] if "fit_transform", ["fit"] otherwise
sample_weight=other_weights + 3,
metadata=metadata,
)
@config_context(enable_metadata_routing=True)
def test_transform_input_explicit_value_check():
"""Test that the right transformed values are passed to `fit`."""
class Transformer(TransformerMixin, BaseEstimator):
def fit(self, X, y):
self.fitted_ = True
return self
def transform(self, X):
return X + 1
class Estimator(ClassifierMixin, BaseEstimator):
def fit(self, X, y, X_val=None, y_val=None):
assert_array_equal(X, np.array([[1, 2]]))
assert_array_equal(y, np.array([0, 1]))
assert_array_equal(X_val, np.array([[2, 3]]))
assert_array_equal(y_val, np.array([0, 1]))
return self
X = np.array([[0, 1]])
y = np.array([0, 1])
X_val = np.array([[1, 2]])
y_val = np.array([0, 1])
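    # X is passed through the transformer as usual, so the estimator sees
    # X + 1. `transform_input=["X_val"]` additionally routes X_val through the
    # transformer, so the estimator sees X_val + 1 == [[2, 3]], while y and
    # y_val are forwarded untouched.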
pipe = Pipeline(
[
("transformer", Transformer()),
("estimator", Estimator().set_fit_request(X_val=True, y_val=True)),
],
transform_input=["X_val"],
)
pipe.fit(X, y, X_val=X_val, y_val=y_val)
def test_transform_input_no_slep6():
"""Make sure the right error is raised if slep6 is not enabled."""
X = np.array([[1, 2], [3, 4]])
y = np.array([0, 1])
msg = "The `transform_input` parameter can only be set if metadata"
with pytest.raises(ValueError, match=msg):
make_pipeline(DummyTransf(), transform_input=["blah"]).fit(X, y)
@config_context(enable_metadata_routing=True)
def test_transform_tuple_input():
"""Test that if metadata is a tuple of arrays, both arrays are transformed."""
class Estimator(ClassifierMixin, BaseEstimator):
def fit(self, X, y, X_val=None, y_val=None):
assert isinstance(X_val, tuple)
assert isinstance(y_val, tuple)
            # make sure each element of the X_val tuple was transformed by the
            # transformer
assert_array_equal(X_val[0], np.array([[2, 3]]))
assert_array_equal(y_val[0], np.array([0, 1]))
assert_array_equal(X_val[1], np.array([[11, 12]]))
assert_array_equal(y_val[1], np.array([1, 2]))
self.fitted_ = True
return self
class Transformer(TransformerMixin, BaseEstimator):
def fit(self, X, y):
self.fitted_ = True
return self
def transform(self, X):
return X + 1
X = np.array([[1, 2]])
y = np.array([0, 1])
X_val0 = np.array([[1, 2]])
y_val0 = np.array([0, 1])
X_val1 = np.array([[10, 11]])
y_val1 = np.array([1, 2])
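    # Each element of the X_val tuple is routed through the transformer
    # independently (+1); y_val is not listed in `transform_input`, so its
    # tuple is forwarded unchanged.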
pipe = Pipeline(
[
("transformer", Transformer()),
("estimator", Estimator().set_fit_request(X_val=True, y_val=True)),
],
transform_input=["X_val"],
)
pipe.fit(X, y, X_val=(X_val0, X_val1), y_val=(y_val0, y_val1))
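

# A condensed sketch of the feature under test (illustrative; `_AddOne` and
# `_ValAware` are made-up helpers): with routing enabled, `transform_input`
# replays the listed metadata through the earlier steps before it reaches a
# consuming step.
def _sketch_transform_input_usage():
    class _AddOne(TransformerMixin, BaseEstimator):
        def fit(self, X, y=None):
            return self

        def transform(self, X):
            return X + 1

    class _ValAware(ClassifierMixin, BaseEstimator):
        def fit(self, X, y, X_val=None):
            self.seen_X_val_ = X_val
            return self

    with config_context(enable_metadata_routing=True):
        pipe = Pipeline(
            [("add", _AddOne()), ("clf", _ValAware().set_fit_request(X_val=True))],
            transform_input=["X_val"],
        )
        pipe.fit(np.array([[0.0]]), np.array([0]), X_val=np.array([[1.0]]))
    return pipe[-1].seen_X_val_  # the transformed value: array([[2.]])
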
# end of transform_input tests
# =============================
# TODO(1.8): change warning to checking for NotFittedError
@pytest.mark.parametrize(
"method",
[
"predict",
"predict_proba",
"predict_log_proba",
"decision_function",
"score",
"score_samples",
"transform",
"inverse_transform",
],
)
def test_pipeline_warns_not_fitted(method):
class StatelessEstimator(BaseEstimator):
"""Stateless estimator that doesn't check if it's fitted.
Stateless estimators that don't require fit, should properly set the
`requires_fit` flag and implement a `__sklearn_check_is_fitted__` returning
`True`.
"""
def fit(self, X, y):
return self # pragma: no cover
def transform(self, X):
return X
def predict(self, X):
return np.ones(len(X))
def predict_proba(self, X):
return np.ones(len(X))
def predict_log_proba(self, X):
return np.zeros(len(X))
def decision_function(self, X):
return np.ones(len(X))
def score(self, X, y):
return 1
def score_samples(self, X):
return np.ones(len(X))
def inverse_transform(self, X):
return X
pipe = Pipeline([("estimator", StatelessEstimator())])
with pytest.warns(FutureWarning, match="This Pipeline instance is not fitted yet."):
getattr(pipe, method)([[1]])
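

# Sketch (illustrative): a truly stateless step can opt out of the warning by
# declaring itself always fitted via `__sklearn_is_fitted__`.
def _sketch_stateless_step_opts_out():
    class _Identity(TransformerMixin, BaseEstimator):
        def __sklearn_is_fitted__(self):
            return True

        def fit(self, X, y=None):
            return self

        def transform(self, X):
            return X

    # no FutureWarning here: the only step reports itself as fitted
    return Pipeline([("identity", _Identity())]).transform([[1]])
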
# Test that metadata is routed correctly for pipelines and FeatureUnion
# =====================================================================
class SimpleEstimator(BaseEstimator):
    # This class is used in this section to test metadata routing in
    # pipelines. It accepts `sample_weight` and `prop` in every method, so
    # every `set_{method}_request` setter exists.
def __sklearn_is_fitted__(self):
return True
def fit(self, X, y, sample_weight=None, prop=None):
assert sample_weight is not None, sample_weight
assert prop is not None, prop
return self
def fit_transform(self, X, y, sample_weight=None, prop=None):
assert sample_weight is not None
assert prop is not None
return X + 1
def fit_predict(self, X, y, sample_weight=None, prop=None):
assert sample_weight is not None
assert prop is not None
return np.ones(len(X))
def predict(self, X, sample_weight=None, prop=None):
assert sample_weight is not None
assert prop is not None
return np.ones(len(X))
def predict_proba(self, X, sample_weight=None, prop=None):
assert sample_weight is not None
assert prop is not None
return np.ones(len(X))
def predict_log_proba(self, X, sample_weight=None, prop=None):
assert sample_weight is not None
assert prop is not None
return np.zeros(len(X))
def decision_function(self, X, sample_weight=None, prop=None):
assert sample_weight is not None
assert prop is not None
return np.ones(len(X))
def score(self, X, y, sample_weight=None, prop=None):
assert sample_weight is not None
assert prop is not None
return 1
def transform(self, X, sample_weight=None, prop=None):
assert sample_weight is not None
assert prop is not None
return X + 1
def inverse_transform(self, X, sample_weight=None, prop=None):
assert sample_weight is not None
assert prop is not None
return X - 1
# `split` and `partial_fit` are not relevant for pipelines
@pytest.mark.parametrize("method", sorted(set(METHODS) - {"split", "partial_fit"}))
@config_context(enable_metadata_routing=True)
def test_metadata_routing_for_pipeline(method):
"""Test that metadata is routed correctly for pipelines."""
def set_request(est, method, **kwarg):
"""Set requests for a given method.
If the given method is a composite method, set the same requests for
all the methods that compose it.
"""
if method in COMPOSITE_METHODS:
methods = COMPOSITE_METHODS[method]
else:
methods = [method]
for method in methods:
getattr(est, f"set_{method}_request")(**kwarg)
return est
X, y = np.array([[1]]), np.array([1])
sample_weight, prop, metadata = [1], "a", "b"
# test that metadata is routed correctly for pipelines when requested
est = SimpleEstimator()
est = set_request(est, method, sample_weight=True, prop=True)
est = set_request(est, "fit", sample_weight=True, prop=True)
trs = (
ConsumingTransformer()
.set_fit_request(sample_weight=True, metadata=True)
.set_transform_request(sample_weight=True, metadata=True)
.set_inverse_transform_request(sample_weight=True, metadata=True)
)
pipeline = Pipeline([("trs", trs), ("estimator", est)])
if "fit" not in method:
pipeline = pipeline.fit(X, y, sample_weight=sample_weight, prop=prop)
try:
getattr(pipeline, method)(
X, y, sample_weight=sample_weight, prop=prop, metadata=metadata
)
except TypeError:
# Some methods don't accept y
getattr(pipeline, method)(
X, sample_weight=sample_weight, prop=prop, metadata=metadata
)
# Make sure the transformer has received the metadata
# For the transformer, always only `fit` and `transform` are called.
check_recorded_metadata(
obj=trs,
method="fit",
parent="fit",
sample_weight=sample_weight,
metadata=metadata,
)
check_recorded_metadata(
obj=trs,
method="transform",
parent="transform",
sample_weight=sample_weight,
metadata=metadata,
)
# `split` and `partial_fit` are not relevant for pipelines.
# `sorted` is needed to make `pytest -n<X>` (pytest-xdist) work; without it,
# tests are collected in a different order on each worker, which makes the
# run fail.
@pytest.mark.parametrize("method", sorted(set(METHODS) - {"split", "partial_fit"}))
@config_context(enable_metadata_routing=True)
def test_metadata_routing_error_for_pipeline(method):
"""Test that metadata is not routed for pipelines when not requested."""
X, y = [[1]], [1]
sample_weight, prop = [1], "a"
est = SimpleEstimator()
# here not setting sample_weight request and leaving it as None
pipeline = Pipeline([("estimator", est)])
error_message = (
"[sample_weight, prop] are passed but are not explicitly set as requested"
f" or not requested for SimpleEstimator.{method}"
)
with pytest.raises(ValueError, match=re.escape(error_message)):
try:
# passing X, y positional as the first two arguments
getattr(pipeline, method)(X, y, sample_weight=sample_weight, prop=prop)
except TypeError:
# not all methods accept y (like `predict`), so here we only
# pass X as a positional arg.
getattr(pipeline, method)(X, sample_weight=sample_weight, prop=prop)
@pytest.mark.parametrize(
"method", ["decision_function", "transform", "inverse_transform"]
)
def test_routing_passed_metadata_not_supported(method):
"""Test that the right error message is raised when metadata is passed while
not supported when `enable_metadata_routing=False`."""
pipe = Pipeline([("estimator", SimpleEstimator())])
with pytest.raises(
ValueError, match="is only supported if enable_metadata_routing=True"
):
getattr(pipe, method)([[1]], sample_weight=[1], prop="a")
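

# Sketch (illustrative): the same kind of call succeeds once routing is
# enabled and the metadata is explicitly requested.
def _sketch_with_routing_enabled():
    with config_context(enable_metadata_routing=True):
        est = SimpleEstimator().set_fit_request(sample_weight=True, prop=True)
        pipe = Pipeline([("estimator", est)])
        return pipe.fit([[1]], [1], sample_weight=[1], prop="a")
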
@config_context(enable_metadata_routing=True)
def test_pipeline_with_estimator_with_len():
"""Test that pipeline works with estimators that have a `__len__` method."""
pipe = Pipeline(
[("trs", RandomTreesEmbedding()), ("estimator", RandomForestClassifier())]
)
pipe.fit([[1]], [1])
pipe.predict([[1]])
@pytest.mark.parametrize("last_step", [None, "passthrough"])
@config_context(enable_metadata_routing=True)
def test_pipeline_with_no_last_step(last_step):
"""Test that the pipeline works when there is not last step.
It should just ignore and pass through the data on transform.
"""
pipe = Pipeline([("trs", FunctionTransformer()), ("estimator", last_step)])
assert pipe.fit([[1]], [1]).transform([[1], [2], [3]]) == [[1], [2], [3]]
@config_context(enable_metadata_routing=True)
def test_feature_union_metadata_routing_error():
"""Test that the right error is raised when metadata is not requested."""
X = np.array([[0, 1], [2, 2], [4, 6]])
y = [1, 2, 3]
sample_weight, metadata = [1, 1, 1], "a"
# test lacking set_fit_request
feature_union = FeatureUnion([("sub_transformer", ConsumingTransformer())])
error_message = (
"[sample_weight, metadata] are passed but are not explicitly set as requested"
f" or not requested for {ConsumingTransformer.__name__}.fit"
)
with pytest.raises(UnsetMetadataPassedError, match=re.escape(error_message)):
feature_union.fit(X, y, sample_weight=sample_weight, metadata=metadata)
# test lacking set_transform_request
feature_union = FeatureUnion(
[
(
"sub_transformer",
ConsumingTransformer().set_fit_request(
sample_weight=True, metadata=True
),
)
]
)
error_message = (
"[sample_weight, metadata] are passed but are not explicitly set as requested "
f"or not requested for {ConsumingTransformer.__name__}.transform"
)
with pytest.raises(UnsetMetadataPassedError, match=re.escape(error_message)):
feature_union.fit(
X, y, sample_weight=sample_weight, metadata=metadata
).transform(X, sample_weight=sample_weight, metadata=metadata)
@config_context(enable_metadata_routing=True)
def test_feature_union_get_metadata_routing_without_fit():
"""Test that get_metadata_routing() works regardless of the Child's
consumption of any metadata."""
feature_union = FeatureUnion([("sub_transformer", ConsumingTransformer())])
feature_union.get_metadata_routing()
@config_context(enable_metadata_routing=True)
@pytest.mark.parametrize(
"transformer", [ConsumingTransformer, ConsumingNoFitTransformTransformer]
)
def test_feature_union_metadata_routing(transformer):
"""Test that metadata is routed correctly for FeatureUnion."""
X = np.array([[0, 1], [2, 2], [4, 6]])
y = [1, 2, 3]
sample_weight, metadata = [1, 1, 1], "a"
feature_union = FeatureUnion(
[
(
"sub_trans1",
transformer(registry=_Registry())
.set_fit_request(sample_weight=True, metadata=True)
.set_transform_request(sample_weight=True, metadata=True),
),
(
"sub_trans2",
transformer(registry=_Registry())
.set_fit_request(sample_weight=True, metadata=True)
.set_transform_request(sample_weight=True, metadata=True),
),
]
)
kwargs = {"sample_weight": sample_weight, "metadata": metadata}
feature_union.fit(X, y, **kwargs)
feature_union.fit_transform(X, y, **kwargs)
feature_union.fit(X, y, **kwargs).transform(X, **kwargs)
    for _, sub_transformer in feature_union.transformer_list:
        registry = sub_transformer.registry
assert len(registry)
for sub_trans in registry:
check_recorded_metadata(
obj=sub_trans,
method="fit",
parent="fit",
**kwargs,
)
# End of routing tests
# ====================