""" Test the pipeline module. """ import itertools import re import shutil import time from tempfile import mkdtemp import joblib import numpy as np import pytest from sklearn import config_context from sklearn.base import ( BaseEstimator, ClassifierMixin, TransformerMixin, clone, is_classifier, is_regressor, ) from sklearn.cluster import KMeans from sklearn.datasets import load_iris from sklearn.decomposition import PCA, TruncatedSVD from sklearn.dummy import DummyRegressor from sklearn.ensemble import ( HistGradientBoostingClassifier, RandomForestClassifier, RandomTreesEmbedding, ) from sklearn.exceptions import NotFittedError, UnsetMetadataPassedError from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_selection import SelectKBest, f_classif from sklearn.impute import SimpleImputer from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression from sklearn.metrics import accuracy_score, r2_score from sklearn.model_selection import train_test_split from sklearn.neighbors import LocalOutlierFactor from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline, make_union from sklearn.preprocessing import FunctionTransformer, StandardScaler from sklearn.svm import SVC from sklearn.tests.metadata_routing_common import ( ConsumingNoFitTransformTransformer, ConsumingTransformer, _Registry, check_recorded_metadata, ) from sklearn.utils import get_tags from sklearn.utils._metadata_requests import COMPOSITE_METHODS, METHODS from sklearn.utils._testing import ( MinimalClassifier, MinimalRegressor, MinimalTransformer, assert_allclose, assert_array_almost_equal, assert_array_equal, ) from sklearn.utils.fixes import CSR_CONTAINERS from sklearn.utils.validation import _check_feature_names, check_is_fitted # Load a shared tests data sets for the tests in this module. Mark them # read-only to avoid unintentional in-place modifications that would introduce # side-effects between tests. 
iris = load_iris()
iris.data.flags.writeable = False
iris.target.flags.writeable = False

JUNK_FOOD_DOCS = (
    "the pizza pizza beer copyright",
    "the pizza burger beer copyright",
    "the the pizza beer beer copyright",
    "the burger beer beer copyright",
    "the coke burger coke copyright",
    "the coke burger burger",
)


class NoFit(BaseEstimator):
    """Small class to test parameter dispatching."""

    def __init__(self, a=None, b=None):
        self.a = a
        self.b = b


class NoTrans(NoFit):
    def fit(self, X, y=None):
        return self

    def get_params(self, deep=False):
        return {"a": self.a, "b": self.b}

    def set_params(self, **params):
        self.a = params["a"]
        return self


class NoInvTransf(TransformerMixin, NoTrans):
    def transform(self, X):
        return X


class Transf(NoInvTransf):
    def transform(self, X):
        return X

    def inverse_transform(self, X):
        return X


class TransfFitParams(Transf):
    def fit(self, X, y=None, **fit_params):
        self.fit_params = fit_params
        return self


class Mult(TransformerMixin, BaseEstimator):
    def __init__(self, mult=1):
        self.mult = mult

    def __sklearn_is_fitted__(self):
        return True

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.asarray(X) * self.mult

    def inverse_transform(self, X):
        return np.asarray(X) / self.mult

    def predict(self, X):
        return (np.asarray(X) * self.mult).sum(axis=1)

    # predict_proba, predict_log_proba and decision_function simply reuse predict.
    predict_proba = predict_log_proba = decision_function = predict

    def score(self, X, y=None):
        return np.sum(X)


class FitParamT(BaseEstimator):
    """Mock classifier"""

    def __init__(self):
        self.successful = False

    def fit(self, X, y, should_succeed=False):
        self.successful = should_succeed
        self.fitted_ = True

    def predict(self, X):
        return self.successful

    def fit_predict(self, X, y, should_succeed=False):
        self.fit(X, y, should_succeed=should_succeed)
        return self.predict(X)

    def score(self, X, y=None, sample_weight=None):
        if sample_weight is not None:
            X = X * sample_weight
        return np.sum(X)


class DummyTransf(Transf):
    """Transformer which stores the column means"""

    def fit(self, X, y):
        self.means_ = np.mean(X, axis=0)
        # store timestamp to figure out whether the result of 'fit' has been
        # cached or not
        self.timestamp_ = time.time()
        return self


class DummyEstimatorParams(BaseEstimator):
    """Mock classifier that takes params on predict"""

    def __sklearn_is_fitted__(self):
        return True

    def fit(self, X, y):
        return self

    def predict(self, X, got_attribute=False):
        self.got_attribute = got_attribute
        return self

    def predict_proba(self, X, got_attribute=False):
        self.got_attribute = got_attribute
        return self

    def predict_log_proba(self, X, got_attribute=False):
        self.got_attribute = got_attribute
        return self


def test_pipeline_invalid_parameters():
    # Test the various init parameters of the pipeline in fit
    # method
    pipeline = Pipeline([(1, 1)])
    with pytest.raises(TypeError):
        pipeline.fit([[1]], [1])

    # Check that we can't fit pipelines with objects without fit
    # method
    msg = (
        "Last step of Pipeline should implement fit "
        "or be the string 'passthrough'"
        ".*NoFit.*"
    )
    pipeline = Pipeline([("clf", NoFit())])
    with pytest.raises(TypeError, match=msg):
        pipeline.fit([[1]], [1])

    # Smoke test with only an estimator
    clf = NoTrans()
    pipe = Pipeline([("svc", clf)])
    assert pipe.get_params(deep=True) == dict(
        svc__a=None, svc__b=None, svc=clf, **pipe.get_params(deep=False)
    )

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert clf.a == 0.1
    assert clf.b is None
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC()
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([("anova", filter1), ("svc", clf)])

    # Check that estimators are not cloned on pipeline construction
    assert pipe.named_steps["anova"] is filter1
    assert pipe.named_steps["svc"] is clf

    # Check that we can't fit with non-transformers on the way
    # Note that NoTrans implements fit, but not transform
    msg = "All intermediate steps should be transformers.*\\bNoTrans\\b.*"
    pipeline = Pipeline([("t", NoTrans()), ("svc", clf)])
    with pytest.raises(TypeError, match=msg):
        pipeline.fit([[1]], [1])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert clf.C == 0.1
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    msg = re.escape(
        "Invalid parameter 'C' for estimator SelectKBest(). Valid parameters are: ['k',"
        " 'score_func']."
    )
    with pytest.raises(ValueError, match=msg):
        pipe.set_params(anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert pipe.named_steps["svc"] is not pipe2.named_steps["svc"]

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop("svc")
    params.pop("anova")
    params2.pop("svc")
    params2.pop("anova")
    assert params == params2


def test_empty_pipeline():
    X = iris.data
    y = iris.target

    pipe = Pipeline([])
    msg = "The pipeline is empty. Please add steps."
    with pytest.raises(ValueError, match=msg):
        pipe.fit(X, y)


def test_pipeline_init_tuple():
    # Pipeline accepts steps as tuple
    X = np.array([[1, 2]])
    pipe = Pipeline((("transf", Transf()), ("clf", FitParamT())))
    pipe.fit(X, y=None)
    pipe.score(X)

    pipe.set_params(transf="passthrough")
    pipe.fit(X, y=None)
    pipe.score(X)


def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression()
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([("anova", filter1), ("logistic", clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)


def test_pipeline_fit_params():
    # Test that the pipeline can take fit parameters
    pipe = Pipeline([("transf", Transf()), ("clf", FitParamT())])
    pipe.fit(X=None, y=None, clf__should_succeed=True)
    # classifier should return True
    assert pipe.predict(None)
    # and transformer params should not be changed
    assert pipe.named_steps["transf"].a is None
    assert pipe.named_steps["transf"].b is None
    # invalid parameters should raise an error message
    msg = re.escape("fit() got an unexpected keyword argument 'bad'")
    with pytest.raises(TypeError, match=msg):
        pipe.fit(None, None, clf__bad=True)


def test_pipeline_sample_weight_supported():
    # Pipeline should pass sample_weight
    X = np.array([[1, 2]])
    pipe = Pipeline([("transf", Transf()), ("clf", FitParamT())])
    pipe.fit(X, y=None)
    assert pipe.score(X) == 3
    assert pipe.score(X, y=None) == 3
    assert pipe.score(X, y=None, sample_weight=None) == 3
    assert pipe.score(X, sample_weight=np.array([2, 3])) == 8


def test_pipeline_sample_weight_unsupported():
    # When sample_weight is None it shouldn't be passed
    X = np.array([[1, 2]])
    pipe = Pipeline([("transf", Transf()), ("clf", Mult())])
    pipe.fit(X, y=None)
    assert pipe.score(X) == 3
    assert pipe.score(X, sample_weight=None) == 3

    msg = re.escape("score() got an unexpected keyword argument 'sample_weight'")
    with pytest.raises(TypeError, match=msg):
        pipe.score(X, sample_weight=np.array([2, 3]))


def test_pipeline_raise_set_params_error():
    # Test pipeline raises set params error message for nested models.
pipe = Pipeline([("cls", LinearRegression())]) # expected error message error_msg = re.escape( "Invalid parameter 'fake' for estimator Pipeline(steps=[('cls'," " LinearRegression())]). Valid parameters are: ['memory', 'steps'," " 'transform_input', 'verbose']." ) with pytest.raises(ValueError, match=error_msg): pipe.set_params(fake="nope") # invalid outer parameter name for compound parameter: the expected error message # is the same as above. with pytest.raises(ValueError, match=error_msg): pipe.set_params(fake__estimator="nope") # expected error message for invalid inner parameter error_msg = re.escape( "Invalid parameter 'invalid_param' for estimator LinearRegression(). Valid" " parameters are: ['copy_X', 'fit_intercept', 'n_jobs', 'positive', 'tol']." ) with pytest.raises(ValueError, match=error_msg): pipe.set_params(cls__invalid_param="nope") def test_pipeline_methods_pca_svm(): # Test the various methods of the pipeline (pca + svm). X = iris.data y = iris.target # Test with PCA + SVC clf = SVC(probability=True, random_state=0) pca = PCA(svd_solver="full", n_components="mle", whiten=True) pipe = Pipeline([("pca", pca), ("svc", clf)]) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) pipe.predict_log_proba(X) pipe.score(X, y) def test_pipeline_score_samples_pca_lof(): X = iris.data # Test that the score_samples method is implemented on a pipeline. # Test that the score_samples method on pipeline yields same results as # applying transform and score_samples steps separately. pca = PCA(svd_solver="full", n_components="mle", whiten=True) lof = LocalOutlierFactor(novelty=True) pipe = Pipeline([("pca", pca), ("lof", lof)]) pipe.fit(X) # Check the shapes assert pipe.score_samples(X).shape == (X.shape[0],) # Check the values lof.fit(pca.fit_transform(X)) assert_allclose(pipe.score_samples(X), lof.score_samples(pca.transform(X))) def test_score_samples_on_pipeline_without_score_samples(): X = np.array([[1], [2]]) y = np.array([1, 2]) # Test that a pipeline does not have score_samples method when the final # step of the pipeline does not have score_samples defined. pipe = make_pipeline(LogisticRegression()) pipe.fit(X, y) inner_msg = "'LogisticRegression' object has no attribute 'score_samples'" outer_msg = "'Pipeline' has no attribute 'score_samples'" with pytest.raises(AttributeError, match=outer_msg) as exec_info: pipe.score_samples(X) assert isinstance(exec_info.value.__cause__, AttributeError) assert inner_msg in str(exec_info.value.__cause__) def test_pipeline_methods_preprocessing_svm(): # Test the various methods of the pipeline (preprocessing + svm). 
X = iris.data y = iris.target n_samples = X.shape[0] n_classes = len(np.unique(y)) scaler = StandardScaler() pca = PCA(n_components=2, svd_solver="randomized", whiten=True) clf = SVC(probability=True, random_state=0, decision_function_shape="ovr") for preprocessing in [scaler, pca]: pipe = Pipeline([("preprocess", preprocessing), ("svc", clf)]) pipe.fit(X, y) # check shapes of various prediction functions predict = pipe.predict(X) assert predict.shape == (n_samples,) proba = pipe.predict_proba(X) assert proba.shape == (n_samples, n_classes) log_proba = pipe.predict_log_proba(X) assert log_proba.shape == (n_samples, n_classes) decision_function = pipe.decision_function(X) assert decision_function.shape == (n_samples, n_classes) pipe.score(X, y) def test_fit_predict_on_pipeline(): # test that the fit_predict method is implemented on a pipeline # test that the fit_predict on pipeline yields same results as applying # transform and clustering steps separately scaler = StandardScaler() km = KMeans(random_state=0, n_init="auto") # As pipeline doesn't clone estimators on construction, # it must have its own estimators scaler_for_pipeline = StandardScaler() km_for_pipeline = KMeans(random_state=0, n_init="auto") # first compute the transform and clustering step separately scaled = scaler.fit_transform(iris.data) separate_pred = km.fit_predict(scaled) # use a pipeline to do the transform and clustering in one step pipe = Pipeline([("scaler", scaler_for_pipeline), ("Kmeans", km_for_pipeline)]) pipeline_pred = pipe.fit_predict(iris.data) assert_array_almost_equal(pipeline_pred, separate_pred) def test_fit_predict_on_pipeline_without_fit_predict(): # tests that a pipeline does not have fit_predict method when final # step of pipeline does not have fit_predict defined scaler = StandardScaler() pca = PCA(svd_solver="full") pipe = Pipeline([("scaler", scaler), ("pca", pca)]) outer_msg = "'Pipeline' has no attribute 'fit_predict'" inner_msg = "'PCA' object has no attribute 'fit_predict'" with pytest.raises(AttributeError, match=outer_msg) as exec_info: getattr(pipe, "fit_predict") assert isinstance(exec_info.value.__cause__, AttributeError) assert inner_msg in str(exec_info.value.__cause__) def test_fit_predict_with_intermediate_fit_params(): # tests that Pipeline passes fit_params to intermediate steps # when fit_predict is invoked pipe = Pipeline([("transf", TransfFitParams()), ("clf", FitParamT())]) pipe.fit_predict( X=None, y=None, transf__should_get_this=True, clf__should_succeed=True ) assert pipe.named_steps["transf"].fit_params["should_get_this"] assert pipe.named_steps["clf"].successful assert "should_succeed" not in pipe.named_steps["transf"].fit_params @pytest.mark.parametrize( "method_name", ["predict", "predict_proba", "predict_log_proba"] ) def test_predict_methods_with_predict_params(method_name): # tests that Pipeline passes predict_* to the final estimator # when predict_* is invoked pipe = Pipeline([("transf", Transf()), ("clf", DummyEstimatorParams())]) pipe.fit(None, None) method = getattr(pipe, method_name) method(X=None, got_attribute=True) assert pipe.named_steps["clf"].got_attribute @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_feature_union(csr_container): # basic sanity check for feature union X = iris.data.copy() X -= X.mean(axis=0) y = iris.target svd = TruncatedSVD(n_components=2, random_state=0) select = SelectKBest(k=1) fs = FeatureUnion([("svd", svd), ("select", select)]) fs.fit(X, y) X_transformed = fs.transform(X) assert X_transformed.shape == 
(X.shape[0], 3) # check if it does the expected thing assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X)) assert_array_equal(X_transformed[:, -1], select.fit_transform(X, y).ravel()) # test if it also works for sparse input # We use a different svd object to control the random_state stream fs = FeatureUnion([("svd", svd), ("select", select)]) X_sp = csr_container(X) X_sp_transformed = fs.fit_transform(X_sp, y) assert_array_almost_equal(X_transformed, X_sp_transformed.toarray()) # Test clone fs2 = clone(fs) assert fs.transformer_list[0][1] is not fs2.transformer_list[0][1] # test setting parameters fs.set_params(select__k=2) assert fs.fit_transform(X, y).shape == (X.shape[0], 4) # test it works with transformers missing fit_transform fs = FeatureUnion([("mock", Transf()), ("svd", svd), ("select", select)]) X_transformed = fs.fit_transform(X, y) assert X_transformed.shape == (X.shape[0], 8) # test error if some elements do not support transform msg = "All estimators should implement fit and transform.*\\bNoTrans\\b" fs = FeatureUnion([("transform", Transf()), ("no_transform", NoTrans())]) with pytest.raises(TypeError, match=msg): fs.fit(X) # test that init accepts tuples fs = FeatureUnion((("svd", svd), ("select", select))) fs.fit(X, y) def test_feature_union_named_transformers(): """Check the behaviour of `named_transformers` attribute.""" transf = Transf() noinvtransf = NoInvTransf() fs = FeatureUnion([("transf", transf), ("noinvtransf", noinvtransf)]) assert fs.named_transformers["transf"] == transf assert fs.named_transformers["noinvtransf"] == noinvtransf # test named attribute assert fs.named_transformers.transf == transf assert fs.named_transformers.noinvtransf == noinvtransf def test_make_union(): pca = PCA(svd_solver="full") mock = Transf() fu = make_union(pca, mock) names, transformers = zip(*fu.transformer_list) assert names == ("pca", "transf") assert transformers == (pca, mock) def test_make_union_kwargs(): pca = PCA(svd_solver="full") mock = Transf() fu = make_union(pca, mock, n_jobs=3) assert fu.transformer_list == make_union(pca, mock).transformer_list assert 3 == fu.n_jobs # invalid keyword parameters should raise an error message msg = re.escape( "make_union() got an unexpected keyword argument 'transformer_weights'" ) with pytest.raises(TypeError, match=msg): make_union(pca, mock, transformer_weights={"pca": 10, "Transf": 1}) def create_mock_transformer(base_name, n_features=3): """Helper to create a mock transformer with custom feature names.""" mock = Transf() mock.get_feature_names_out = lambda input_features: [ f"{base_name}{i}" for i in range(n_features) ] return mock def test_make_union_passes_verbose_feature_names_out(): # Test that make_union passes verbose_feature_names_out # to the FeatureUnion. X = iris.data y = iris.target pca = PCA() mock = create_mock_transformer("transf") union = make_union(pca, mock, verbose_feature_names_out=False) assert not union.verbose_feature_names_out fu_union = make_union(pca, mock, verbose_feature_names_out=True) fu_union.fit(X, y) assert_array_equal( [ "pca__pca0", "pca__pca1", "pca__pca2", "pca__pca3", "transf__transf0", "transf__transf1", "transf__transf2", ], fu_union.get_feature_names_out(), ) def test_pipeline_transform(): # Test whether pipeline works with a transformer at the end. 
# Also test pipeline.transform and pipeline.inverse_transform X = iris.data pca = PCA(n_components=2, svd_solver="full") pipeline = Pipeline([("pca", pca)]) # test transform and fit_transform: X_trans = pipeline.fit(X).transform(X) X_trans2 = pipeline.fit_transform(X) X_trans3 = pca.fit_transform(X) assert_array_almost_equal(X_trans, X_trans2) assert_array_almost_equal(X_trans, X_trans3) X_back = pipeline.inverse_transform(X_trans) X_back2 = pca.inverse_transform(X_trans) assert_array_almost_equal(X_back, X_back2) def test_pipeline_fit_transform(): # Test whether pipeline works with a transformer missing fit_transform X = iris.data y = iris.target transf = Transf() pipeline = Pipeline([("mock", transf)]) # test fit_transform: X_trans = pipeline.fit_transform(X, y) X_trans2 = transf.fit(X, y).transform(X) assert_array_almost_equal(X_trans, X_trans2) @pytest.mark.parametrize( "start, end", [(0, 1), (0, 2), (1, 2), (1, 3), (None, 1), (1, None), (None, None)] ) def test_pipeline_slice(start, end): pipe = Pipeline( [("transf1", Transf()), ("transf2", Transf()), ("clf", FitParamT())], memory="123", verbose=True, ) pipe_slice = pipe[start:end] # Test class assert isinstance(pipe_slice, Pipeline) # Test steps assert pipe_slice.steps == pipe.steps[start:end] # Test named_steps attribute assert ( list(pipe_slice.named_steps.items()) == list(pipe.named_steps.items())[start:end] ) # Test the rest of the parameters pipe_params = pipe.get_params(deep=False) pipe_slice_params = pipe_slice.get_params(deep=False) del pipe_params["steps"] del pipe_slice_params["steps"] assert pipe_params == pipe_slice_params # Test exception msg = "Pipeline slicing only supports a step of 1" with pytest.raises(ValueError, match=msg): pipe[start:end:-1] def test_pipeline_index(): transf = Transf() clf = FitParamT() pipe = Pipeline([("transf", transf), ("clf", clf)]) assert pipe[0] == transf assert pipe["transf"] == transf assert pipe[-1] == clf assert pipe["clf"] == clf # should raise an error if slicing out of range with pytest.raises(IndexError): pipe[3] # should raise an error if indexing with wrong element name with pytest.raises(KeyError): pipe["foobar"] def test_set_pipeline_steps(): transf1 = Transf() transf2 = Transf() pipeline = Pipeline([("mock", transf1)]) assert pipeline.named_steps["mock"] is transf1 # Directly setting attr pipeline.steps = [("mock2", transf2)] assert "mock" not in pipeline.named_steps assert pipeline.named_steps["mock2"] is transf2 assert [("mock2", transf2)] == pipeline.steps # Using set_params pipeline.set_params(steps=[("mock", transf1)]) assert [("mock", transf1)] == pipeline.steps # Using set_params to replace single step pipeline.set_params(mock=transf2) assert [("mock", transf2)] == pipeline.steps # With invalid data pipeline.set_params(steps=[("junk", ())]) msg = re.escape( "Last step of Pipeline should implement fit or be the string 'passthrough'." 
) with pytest.raises(TypeError, match=msg): pipeline.fit([[1]], [1]) msg = "This 'Pipeline' has no attribute 'fit_transform'" with pytest.raises(AttributeError, match=msg): pipeline.fit_transform([[1]], [1]) def test_pipeline_named_steps(): transf = Transf() mult2 = Mult(mult=2) pipeline = Pipeline([("mock", transf), ("mult", mult2)]) # Test access via named_steps bunch object assert "mock" in pipeline.named_steps assert "mock2" not in pipeline.named_steps assert pipeline.named_steps.mock is transf assert pipeline.named_steps.mult is mult2 # Test bunch with conflict attribute of dict pipeline = Pipeline([("values", transf), ("mult", mult2)]) assert pipeline.named_steps.values is not transf assert pipeline.named_steps.mult is mult2 @pytest.mark.parametrize("passthrough", [None, "passthrough"]) def test_pipeline_correctly_adjusts_steps(passthrough): X = np.array([[1]]) y = np.array([1]) mult2 = Mult(mult=2) mult3 = Mult(mult=3) mult5 = Mult(mult=5) pipeline = Pipeline( [("m2", mult2), ("bad", passthrough), ("m3", mult3), ("m5", mult5)] ) pipeline.fit(X, y) expected_names = ["m2", "bad", "m3", "m5"] actual_names = [name for name, _ in pipeline.steps] assert expected_names == actual_names @pytest.mark.parametrize("passthrough", [None, "passthrough"]) def test_set_pipeline_step_passthrough(passthrough): X = np.array([[1]]) y = np.array([1]) mult2 = Mult(mult=2) mult3 = Mult(mult=3) mult5 = Mult(mult=5) def make(): return Pipeline([("m2", mult2), ("m3", mult3), ("last", mult5)]) pipeline = make() exp = 2 * 3 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) pipeline.set_params(m3=passthrough) exp = 2 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) assert pipeline.get_params(deep=True) == { "steps": pipeline.steps, "m2": mult2, "m3": passthrough, "last": mult5, "memory": None, "m2__mult": 2, "last__mult": 5, "transform_input": None, "verbose": False, } pipeline.set_params(m2=passthrough) exp = 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) # for other methods, ensure no AttributeErrors on None: other_methods = [ "predict_proba", "predict_log_proba", "decision_function", "transform", "score", ] for method in other_methods: getattr(pipeline, method)(X) pipeline.set_params(m2=mult2) exp = 2 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) pipeline = make() pipeline.set_params(last=passthrough) # mult2 and mult3 are active exp = 6 assert_array_equal([[exp]], pipeline.fit(X, y).transform(X)) assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) inner_msg = "'str' object has no attribute 'predict'" outer_msg = "This 'Pipeline' has no attribute 'predict'" with pytest.raises(AttributeError, match=outer_msg) as exec_info: getattr(pipeline, "predict") assert isinstance(exec_info.value.__cause__, AttributeError) assert inner_msg in str(exec_info.value.__cause__) # Check 'passthrough' step at construction time exp = 2 * 5 pipeline = Pipeline([("m2", mult2), ("m3", passthrough), ("last", mult5)]) assert_array_equal([[exp]], pipeline.fit_transform(X, y)) 
assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) def test_pipeline_ducktyping(): pipeline = make_pipeline(Mult(5)) pipeline.predict pipeline.transform pipeline.inverse_transform pipeline = make_pipeline(Transf()) assert not hasattr(pipeline, "predict") pipeline.transform pipeline.inverse_transform pipeline = make_pipeline("passthrough") assert pipeline.steps[0] == ("passthrough", "passthrough") assert not hasattr(pipeline, "predict") pipeline.transform pipeline.inverse_transform pipeline = make_pipeline(Transf(), NoInvTransf()) assert not hasattr(pipeline, "predict") pipeline.transform assert not hasattr(pipeline, "inverse_transform") pipeline = make_pipeline(NoInvTransf(), Transf()) assert not hasattr(pipeline, "predict") pipeline.transform assert not hasattr(pipeline, "inverse_transform") def test_make_pipeline(): t1 = Transf() t2 = Transf() pipe = make_pipeline(t1, t2) assert isinstance(pipe, Pipeline) assert pipe.steps[0][0] == "transf-1" assert pipe.steps[1][0] == "transf-2" pipe = make_pipeline(t1, t2, FitParamT()) assert isinstance(pipe, Pipeline) assert pipe.steps[0][0] == "transf-1" assert pipe.steps[1][0] == "transf-2" assert pipe.steps[2][0] == "fitparamt" @pytest.mark.parametrize( "pipeline, check_estimator_type", [ (make_pipeline(StandardScaler(), LogisticRegression()), is_classifier), (make_pipeline(StandardScaler(), LinearRegression()), is_regressor), ( make_pipeline(StandardScaler()), lambda est: get_tags(est).estimator_type is None, ), (Pipeline([]), lambda est: est._estimator_type is None), ], ) def test_pipeline_estimator_type(pipeline, check_estimator_type): """Check that the estimator type returned by the pipeline is correct. Non-regression test as part of: https://github.com/scikit-learn/scikit-learn/issues/30197 """ # Smoke test the repr repr(pipeline) assert check_estimator_type(pipeline) def test_sklearn_tags_with_empty_pipeline(): """Check that we propagate properly the tags in a Pipeline. 
Non-regression test as part of: https://github.com/scikit-learn/scikit-learn/issues/30197 """ empty_pipeline = Pipeline(steps=[]) be = BaseEstimator() expected_tags = be.__sklearn_tags__() assert empty_pipeline.__sklearn_tags__() == expected_tags def test_feature_union_weights(): # test feature union with transformer weights X = iris.data y = iris.target pca = PCA(n_components=2, svd_solver="randomized", random_state=0) select = SelectKBest(k=1) # test using fit followed by transform fs = FeatureUnion( [("pca", pca), ("select", select)], transformer_weights={"pca": 10} ) fs.fit(X, y) X_transformed = fs.transform(X) # test using fit_transform fs = FeatureUnion( [("pca", pca), ("select", select)], transformer_weights={"pca": 10} ) X_fit_transformed = fs.fit_transform(X, y) # test it works with transformers missing fit_transform fs = FeatureUnion( [("mock", Transf()), ("pca", pca), ("select", select)], transformer_weights={"mock": 10}, ) X_fit_transformed_wo_method = fs.fit_transform(X, y) # check against expected result # We use a different pca object to control the random_state stream assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X)) assert_array_equal(X_transformed[:, -1], select.fit_transform(X, y).ravel()) assert_array_almost_equal(X_fit_transformed[:, :-1], 10 * pca.fit_transform(X)) assert_array_equal(X_fit_transformed[:, -1], select.fit_transform(X, y).ravel()) assert X_fit_transformed_wo_method.shape == (X.shape[0], 7) def test_feature_union_parallel(): # test that n_jobs work for FeatureUnion X = JUNK_FOOD_DOCS fs = FeatureUnion( [ ("words", CountVectorizer(analyzer="word")), ("chars", CountVectorizer(analyzer="char")), ] ) fs_parallel = FeatureUnion( [ ("words", CountVectorizer(analyzer="word")), ("chars", CountVectorizer(analyzer="char")), ], n_jobs=2, ) fs_parallel2 = FeatureUnion( [ ("words", CountVectorizer(analyzer="word")), ("chars", CountVectorizer(analyzer="char")), ], n_jobs=2, ) fs.fit(X) X_transformed = fs.transform(X) assert X_transformed.shape[0] == len(X) fs_parallel.fit(X) X_transformed_parallel = fs_parallel.transform(X) assert X_transformed.shape == X_transformed_parallel.shape assert_array_equal(X_transformed.toarray(), X_transformed_parallel.toarray()) # fit_transform should behave the same X_transformed_parallel2 = fs_parallel2.fit_transform(X) assert_array_equal(X_transformed.toarray(), X_transformed_parallel2.toarray()) # transformers should stay fit after fit_transform X_transformed_parallel2 = fs_parallel2.transform(X) assert_array_equal(X_transformed.toarray(), X_transformed_parallel2.toarray()) def test_feature_union_feature_names(): word_vect = CountVectorizer(analyzer="word") char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3)) ft = FeatureUnion([("chars", char_vect), ("words", word_vect)]) ft.fit(JUNK_FOOD_DOCS) feature_names = ft.get_feature_names_out() for feat in feature_names: assert "chars__" in feat or "words__" in feat assert len(feature_names) == 35 ft = FeatureUnion([("tr1", Transf())]).fit([[1]]) msg = re.escape( "Transformer tr1 (type Transf) does not provide get_feature_names_out" ) with pytest.raises(AttributeError, match=msg): ft.get_feature_names_out() def test_classes_property(): X = iris.data y = iris.target reg = make_pipeline(SelectKBest(k=1), LinearRegression()) reg.fit(X, y) with pytest.raises(AttributeError): getattr(reg, "classes_") clf = make_pipeline(SelectKBest(k=1), LogisticRegression(random_state=0)) with pytest.raises(AttributeError): getattr(clf, "classes_") clf.fit(X, y) 
assert_array_equal(clf.classes_, np.unique(y)) def test_set_feature_union_steps(): mult2 = Mult(2) mult3 = Mult(3) mult5 = Mult(5) mult3.get_feature_names_out = lambda input_features: ["x3"] mult2.get_feature_names_out = lambda input_features: ["x2"] mult5.get_feature_names_out = lambda input_features: ["x5"] ft = FeatureUnion([("m2", mult2), ("m3", mult3)]) assert_array_equal([[2, 3]], ft.transform(np.asarray([[1]]))) assert_array_equal(["m2__x2", "m3__x3"], ft.get_feature_names_out()) # Directly setting attr ft.transformer_list = [("m5", mult5)] assert_array_equal([[5]], ft.transform(np.asarray([[1]]))) assert_array_equal(["m5__x5"], ft.get_feature_names_out()) # Using set_params ft.set_params(transformer_list=[("mock", mult3)]) assert_array_equal([[3]], ft.transform(np.asarray([[1]]))) assert_array_equal(["mock__x3"], ft.get_feature_names_out()) # Using set_params to replace single step ft.set_params(mock=mult5) assert_array_equal([[5]], ft.transform(np.asarray([[1]]))) assert_array_equal(["mock__x5"], ft.get_feature_names_out()) def test_set_feature_union_step_drop(): mult2 = Mult(2) mult3 = Mult(3) mult2.get_feature_names_out = lambda input_features: ["x2"] mult3.get_feature_names_out = lambda input_features: ["x3"] X = np.asarray([[1]]) ft = FeatureUnion([("m2", mult2), ("m3", mult3)]) assert_array_equal([[2, 3]], ft.fit(X).transform(X)) assert_array_equal([[2, 3]], ft.fit_transform(X)) assert_array_equal(["m2__x2", "m3__x3"], ft.get_feature_names_out()) ft.set_params(m2="drop") assert_array_equal([[3]], ft.fit(X).transform(X)) assert_array_equal([[3]], ft.fit_transform(X)) assert_array_equal(["m3__x3"], ft.get_feature_names_out()) ft.set_params(m3="drop") assert_array_equal([[]], ft.fit(X).transform(X)) assert_array_equal([[]], ft.fit_transform(X)) assert_array_equal([], ft.get_feature_names_out()) # check we can change back ft.set_params(m3=mult3) assert_array_equal([[3]], ft.fit(X).transform(X)) # Check 'drop' step at construction time ft = FeatureUnion([("m2", "drop"), ("m3", mult3)]) assert_array_equal([[3]], ft.fit(X).transform(X)) assert_array_equal([[3]], ft.fit_transform(X)) assert_array_equal(["m3__x3"], ft.get_feature_names_out()) def test_set_feature_union_passthrough(): """Check the behaviour of setting a transformer to `"passthrough"`.""" mult2 = Mult(2) mult3 = Mult(3) # We only test get_features_names_out, as get_feature_names is unsupported by # FunctionTransformer, and hence unsupported by FeatureUnion passthrough. 
mult2.get_feature_names_out = lambda input_features: ["x2"] mult3.get_feature_names_out = lambda input_features: ["x3"] X = np.asarray([[1]]) ft = FeatureUnion([("m2", mult2), ("m3", mult3)]) assert_array_equal([[2, 3]], ft.fit(X).transform(X)) assert_array_equal([[2, 3]], ft.fit_transform(X)) assert_array_equal(["m2__x2", "m3__x3"], ft.get_feature_names_out()) ft.set_params(m2="passthrough") assert_array_equal([[1, 3]], ft.fit(X).transform(X)) assert_array_equal([[1, 3]], ft.fit_transform(X)) assert_array_equal(["m2__myfeat", "m3__x3"], ft.get_feature_names_out(["myfeat"])) ft.set_params(m3="passthrough") assert_array_equal([[1, 1]], ft.fit(X).transform(X)) assert_array_equal([[1, 1]], ft.fit_transform(X)) assert_array_equal( ["m2__myfeat", "m3__myfeat"], ft.get_feature_names_out(["myfeat"]) ) # check we can change back ft.set_params(m3=mult3) assert_array_equal([[1, 3]], ft.fit(X).transform(X)) assert_array_equal([[1, 3]], ft.fit_transform(X)) assert_array_equal(["m2__myfeat", "m3__x3"], ft.get_feature_names_out(["myfeat"])) # Check 'passthrough' step at construction time ft = FeatureUnion([("m2", "passthrough"), ("m3", mult3)]) assert_array_equal([[1, 3]], ft.fit(X).transform(X)) assert_array_equal([[1, 3]], ft.fit_transform(X)) assert_array_equal(["m2__myfeat", "m3__x3"], ft.get_feature_names_out(["myfeat"])) X = iris.data columns = X.shape[1] pca = PCA(n_components=2, svd_solver="randomized", random_state=0) ft = FeatureUnion([("passthrough", "passthrough"), ("pca", pca)]) assert_array_equal(X, ft.fit(X).transform(X)[:, :columns]) assert_array_equal(X, ft.fit_transform(X)[:, :columns]) assert_array_equal( [ "passthrough__f0", "passthrough__f1", "passthrough__f2", "passthrough__f3", "pca__pca0", "pca__pca1", ], ft.get_feature_names_out(["f0", "f1", "f2", "f3"]), ) ft.set_params(pca="passthrough") X_ft = ft.fit(X).transform(X) assert_array_equal(X_ft, np.hstack([X, X])) X_ft = ft.fit_transform(X) assert_array_equal(X_ft, np.hstack([X, X])) assert_array_equal( [ "passthrough__f0", "passthrough__f1", "passthrough__f2", "passthrough__f3", "pca__f0", "pca__f1", "pca__f2", "pca__f3", ], ft.get_feature_names_out(["f0", "f1", "f2", "f3"]), ) ft.set_params(passthrough=pca) assert_array_equal(X, ft.fit(X).transform(X)[:, -columns:]) assert_array_equal(X, ft.fit_transform(X)[:, -columns:]) assert_array_equal( [ "passthrough__pca0", "passthrough__pca1", "pca__f0", "pca__f1", "pca__f2", "pca__f3", ], ft.get_feature_names_out(["f0", "f1", "f2", "f3"]), ) ft = FeatureUnion( [("passthrough", "passthrough"), ("pca", pca)], transformer_weights={"passthrough": 2}, ) assert_array_equal(X * 2, ft.fit(X).transform(X)[:, :columns]) assert_array_equal(X * 2, ft.fit_transform(X)[:, :columns]) assert_array_equal( [ "passthrough__f0", "passthrough__f1", "passthrough__f2", "passthrough__f3", "pca__pca0", "pca__pca1", ], ft.get_feature_names_out(["f0", "f1", "f2", "f3"]), ) def test_feature_union_passthrough_get_feature_names_out_true(): """Check feature_names_out for verbose_feature_names_out=True (default)""" X = iris.data pca = PCA(n_components=2, svd_solver="randomized", random_state=0) ft = FeatureUnion([("pca", pca), ("passthrough", "passthrough")]) ft.fit(X) assert_array_equal( [ "pca__pca0", "pca__pca1", "passthrough__x0", "passthrough__x1", "passthrough__x2", "passthrough__x3", ], ft.get_feature_names_out(), ) def test_feature_union_passthrough_get_feature_names_out_false(): """Check feature_names_out for verbose_feature_names_out=False""" X = iris.data pca = PCA(n_components=2, svd_solver="randomized", 
random_state=0) ft = FeatureUnion( [("pca", pca), ("passthrough", "passthrough")], verbose_feature_names_out=False ) ft.fit(X) assert_array_equal( [ "pca0", "pca1", "x0", "x1", "x2", "x3", ], ft.get_feature_names_out(), ) def test_feature_union_passthrough_get_feature_names_out_false_errors(): """Check get_feature_names_out and non-verbose names and colliding names.""" pd = pytest.importorskip("pandas") X = pd.DataFrame([[1, 2], [2, 3]], columns=["a", "b"]) select_a = FunctionTransformer( lambda X: X[["a"]], feature_names_out=lambda self, _: np.asarray(["a"]) ) union = FeatureUnion( [("t1", StandardScaler()), ("t2", select_a)], verbose_feature_names_out=False, ) union.fit(X) msg = re.escape( "Output feature names: ['a'] are not unique. " "Please set verbose_feature_names_out=True to add prefixes to feature names" ) with pytest.raises(ValueError, match=msg): union.get_feature_names_out() def test_feature_union_passthrough_get_feature_names_out_false_errors_overlap_over_5(): """Check get_feature_names_out with non-verbose names and >= 5 colliding names.""" pd = pytest.importorskip("pandas") X = pd.DataFrame([list(range(10))], columns=[f"f{i}" for i in range(10)]) union = FeatureUnion( [("t1", "passthrough"), ("t2", "passthrough")], verbose_feature_names_out=False, ) union.fit(X) msg = re.escape( "Output feature names: ['f0', 'f1', 'f2', 'f3', 'f4', ...] " "are not unique. Please set verbose_feature_names_out=True to add prefixes to" " feature names" ) with pytest.raises(ValueError, match=msg): union.get_feature_names_out() def test_step_name_validation(): error_message_1 = r"Estimator names must not contain __: got \['a__q'\]" error_message_2 = r"Names provided are not unique: \['a', 'a'\]" error_message_3 = r"Estimator names conflict with constructor arguments: \['%s'\]" bad_steps1 = [("a__q", Mult(2)), ("b", Mult(3))] bad_steps2 = [("a", Mult(2)), ("a", Mult(3))] for cls, param in [(Pipeline, "steps"), (FeatureUnion, "transformer_list")]: # we validate in construction (despite scikit-learn convention) bad_steps3 = [("a", Mult(2)), (param, Mult(3))] for bad_steps, message in [ (bad_steps1, error_message_1), (bad_steps2, error_message_2), (bad_steps3, error_message_3 % param), ]: # three ways to make invalid: # - construction with pytest.raises(ValueError, match=message): cls(**{param: bad_steps}).fit([[1]], [1]) # - setattr est = cls(**{param: [("a", Mult(1))]}) setattr(est, param, bad_steps) with pytest.raises(ValueError, match=message): est.fit([[1]], [1]) with pytest.raises(ValueError, match=message): est.fit_transform([[1]], [1]) # - set_params est = cls(**{param: [("a", Mult(1))]}) est.set_params(**{param: bad_steps}) with pytest.raises(ValueError, match=message): est.fit([[1]], [1]) with pytest.raises(ValueError, match=message): est.fit_transform([[1]], [1]) def test_set_params_nested_pipeline(): estimator = Pipeline([("a", Pipeline([("b", DummyRegressor())]))]) estimator.set_params(a__b__alpha=0.001, a__b=Lasso()) estimator.set_params(a__steps=[("b", LogisticRegression())], a__b__C=5) def test_pipeline_memory(): X = iris.data y = iris.target cachedir = mkdtemp() try: memory = joblib.Memory(location=cachedir, verbose=10) # Test with Transformer + SVC clf = SVC(probability=True, random_state=0) transf = DummyTransf() pipe = Pipeline([("transf", clone(transf)), ("svc", clf)]) cached_pipe = Pipeline([("transf", transf), ("svc", clf)], memory=memory) # Memoize the transformer at the first fit cached_pipe.fit(X, y) pipe.fit(X, y) # Get the time stamp of the transformer in the cached 
pipeline ts = cached_pipe.named_steps["transf"].timestamp_ # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_ ) assert not hasattr(transf, "means_") # Check that we are reading the cache while fitting # a second time cached_pipe.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_ ) assert ts == cached_pipe.named_steps["transf"].timestamp_ # Create a new pipeline with cloned estimators # Check that even changing the name step does not affect the cache hit clf_2 = SVC(probability=True, random_state=0) transf_2 = DummyTransf() cached_pipe_2 = Pipeline( [("transf_2", transf_2), ("svc", clf_2)], memory=memory ) cached_pipe_2.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe_2.predict_proba(X)) assert_array_equal( pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X) ) assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe_2.named_steps["transf_2"].means_, ) assert ts == cached_pipe_2.named_steps["transf_2"].timestamp_ finally: shutil.rmtree(cachedir) def test_make_pipeline_memory(): cachedir = mkdtemp() memory = joblib.Memory(location=cachedir, verbose=10) pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory) assert pipeline.memory is memory pipeline = make_pipeline(DummyTransf(), SVC()) assert pipeline.memory is None assert len(pipeline) == 2 shutil.rmtree(cachedir) class FeatureNameSaver(BaseEstimator): def fit(self, X, y=None): _check_feature_names(self, X, reset=True) return self def transform(self, X, y=None): return X def get_feature_names_out(self, input_features=None): return input_features def test_features_names_passthrough(): """Check pipeline.get_feature_names_out with passthrough""" pipe = Pipeline( steps=[ ("names", FeatureNameSaver()), ("pass", "passthrough"), ("clf", LogisticRegression()), ] ) iris = load_iris() pipe.fit(iris.data, iris.target) assert_array_equal( pipe[:-1].get_feature_names_out(iris.feature_names), iris.feature_names ) def test_feature_names_count_vectorizer(): """Check pipeline.get_feature_names_out with vectorizers""" pipe = Pipeline(steps=[("vect", CountVectorizer()), ("clf", LogisticRegression())]) y = ["pizza" in x for x in JUNK_FOOD_DOCS] pipe.fit(JUNK_FOOD_DOCS, y) assert_array_equal( pipe[:-1].get_feature_names_out(), ["beer", "burger", "coke", "copyright", "pizza", "the"], ) assert_array_equal( pipe[:-1].get_feature_names_out("nonsense_is_ignored"), ["beer", "burger", "coke", "copyright", "pizza", "the"], ) def test_pipeline_feature_names_out_error_without_definition(): """Check that error is raised when a transformer does not define `get_feature_names_out`.""" pipe = 
Pipeline(steps=[("notrans", NoTrans())]) iris = load_iris() pipe.fit(iris.data, iris.target) msg = "does not provide get_feature_names_out" with pytest.raises(AttributeError, match=msg): pipe.get_feature_names_out() def test_pipeline_param_error(): clf = make_pipeline(LogisticRegression()) with pytest.raises( ValueError, match="Pipeline.fit does not accept the sample_weight parameter" ): clf.fit([[0], [0]], [0, 1], sample_weight=[1, 1]) parameter_grid_test_verbose = ( (est, pattern, method) for (est, pattern), method in itertools.product( [ ( Pipeline([("transf", Transf()), ("clf", FitParamT())]), r"\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n" r"\[Pipeline\].*\(step 2 of 2\) Processing clf.* total=.*\n$", ), ( Pipeline([("transf", Transf()), ("noop", None), ("clf", FitParamT())]), r"\[Pipeline\].*\(step 1 of 3\) Processing transf.* total=.*\n" r"\[Pipeline\].*\(step 2 of 3\) Processing noop.* total=.*\n" r"\[Pipeline\].*\(step 3 of 3\) Processing clf.* total=.*\n$", ), ( Pipeline( [ ("transf", Transf()), ("noop", "passthrough"), ("clf", FitParamT()), ] ), r"\[Pipeline\].*\(step 1 of 3\) Processing transf.* total=.*\n" r"\[Pipeline\].*\(step 2 of 3\) Processing noop.* total=.*\n" r"\[Pipeline\].*\(step 3 of 3\) Processing clf.* total=.*\n$", ), ( Pipeline([("transf", Transf()), ("clf", None)]), r"\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n" r"\[Pipeline\].*\(step 2 of 2\) Processing clf.* total=.*\n$", ), ( Pipeline([("transf", None), ("mult", Mult())]), r"\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n" r"\[Pipeline\].*\(step 2 of 2\) Processing mult.* total=.*\n$", ), ( Pipeline([("transf", "passthrough"), ("mult", Mult())]), r"\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n" r"\[Pipeline\].*\(step 2 of 2\) Processing mult.* total=.*\n$", ), ( FeatureUnion([("mult1", Mult()), ("mult2", Mult())]), r"\[FeatureUnion\].*\(step 1 of 2\) Processing mult1.* total=.*\n" r"\[FeatureUnion\].*\(step 2 of 2\) Processing mult2.* total=.*\n$", ), ( FeatureUnion([("mult1", "drop"), ("mult2", Mult()), ("mult3", "drop")]), r"\[FeatureUnion\].*\(step 1 of 1\) Processing mult2.* total=.*\n$", ), ], ["fit", "fit_transform", "fit_predict"], ) if hasattr(est, method) and not ( method == "fit_transform" and hasattr(est, "steps") and isinstance(est.steps[-1][1], FitParamT) ) ) @pytest.mark.parametrize("est, pattern, method", parameter_grid_test_verbose) def test_verbose(est, method, pattern, capsys): func = getattr(est, method) X = [[1, 2, 3], [4, 5, 6]] y = [[7], [8]] est.set_params(verbose=False) func(X, y) assert not capsys.readouterr().out, "Got output for verbose=False" est.set_params(verbose=True) func(X, y) assert re.match(pattern, capsys.readouterr().out) def test_n_features_in_pipeline(): # make sure pipelines delegate n_features_in to the first step X = [[1, 2], [3, 4], [5, 6]] y = [0, 1, 2] ss = StandardScaler() gbdt = HistGradientBoostingClassifier() pipe = make_pipeline(ss, gbdt) assert not hasattr(pipe, "n_features_in_") pipe.fit(X, y) assert pipe.n_features_in_ == ss.n_features_in_ == 2 # if the first step has the n_features_in attribute then the pipeline also # has it, even though it isn't fitted. 
ss = StandardScaler() gbdt = HistGradientBoostingClassifier() pipe = make_pipeline(ss, gbdt) ss.fit(X, y) assert pipe.n_features_in_ == ss.n_features_in_ == 2 assert not hasattr(gbdt, "n_features_in_") def test_n_features_in_feature_union(): # make sure FeatureUnion delegates n_features_in to the first transformer X = [[1, 2], [3, 4], [5, 6]] y = [0, 1, 2] ss = StandardScaler() fu = make_union(ss) assert not hasattr(fu, "n_features_in_") fu.fit(X, y) assert fu.n_features_in_ == ss.n_features_in_ == 2 # if the first step has the n_features_in attribute then the feature_union # also has it, even though it isn't fitted. ss = StandardScaler() fu = make_union(ss) ss.fit(X, y) assert fu.n_features_in_ == ss.n_features_in_ == 2 def test_feature_union_fit_params(): # Regression test for issue: #15117 class DummyTransformer(TransformerMixin, BaseEstimator): def fit(self, X, y=None, **fit_params): if fit_params != {"a": 0}: raise ValueError return self def transform(self, X, y=None): return X X, y = iris.data, iris.target t = FeatureUnion([("dummy0", DummyTransformer()), ("dummy1", DummyTransformer())]) with pytest.raises(ValueError): t.fit(X, y) with pytest.raises(ValueError): t.fit_transform(X, y) t.fit(X, y, a=0) t.fit_transform(X, y, a=0) def test_feature_union_fit_params_without_fit_transform(): # Test that metadata is passed correctly to underlying transformers that don't # implement a `fit_transform` method when SLEP6 is not enabled. class DummyTransformer(ConsumingNoFitTransformTransformer): def fit(self, X, y=None, **fit_params): if fit_params != {"metadata": 1}: raise ValueError return self X, y = iris.data, iris.target t = FeatureUnion( [ ("nofittransform0", DummyTransformer()), ("nofittransform1", DummyTransformer()), ] ) with pytest.raises(ValueError): t.fit_transform(X, y, metadata=0) t.fit_transform(X, y, metadata=1) def test_pipeline_missing_values_leniency(): # check that pipeline let the missing values validation to # the underlying transformers and predictors. X, y = iris.data.copy(), iris.target.copy() mask = np.random.choice([1, 0], X.shape, p=[0.1, 0.9]).astype(bool) X[mask] = np.nan pipe = make_pipeline(SimpleImputer(), LogisticRegression()) assert pipe.fit(X, y).score(X, y) > 0.4 def test_feature_union_warns_unknown_transformer_weight(): # Warn user when transformer_weights containers a key not present in # transformer_list X = [[1, 2], [3, 4], [5, 6]] y = [0, 1, 2] transformer_list = [("transf", Transf())] # Transformer weights dictionary with incorrect name weights = {"transformer": 1} expected_msg = ( 'Attempting to weight transformer "transformer", ' "but it is not present in transformer_list." ) union = FeatureUnion(transformer_list, transformer_weights=weights) with pytest.raises(ValueError, match=expected_msg): union.fit(X, y) @pytest.mark.parametrize("passthrough", [None, "passthrough"]) def test_pipeline_get_tags_none(passthrough): # Checks that tags are set correctly when the first transformer is None or # 'passthrough' # Non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/18815 pipe = make_pipeline(passthrough, SVC()) assert not pipe.__sklearn_tags__().input_tags.pairwise # FIXME: Replace this test with a full `check_estimator` once we have API only # checks. @pytest.mark.parametrize("Predictor", [MinimalRegressor, MinimalClassifier]) def test_search_cv_using_minimal_compatible_estimator(Predictor): # Check that third-party library estimators can be part of a pipeline # and tuned by grid-search without inheriting from BaseEstimator. 
rng = np.random.RandomState(0) X, y = rng.randn(25, 2), np.array([0] * 5 + [1] * 20) model = Pipeline( [("transformer", MinimalTransformer()), ("predictor", Predictor())] ) model.fit(X, y) y_pred = model.predict(X) if is_classifier(model): assert_array_equal(y_pred, 1) assert model.score(X, y) == pytest.approx(accuracy_score(y, y_pred)) else: assert_allclose(y_pred, y.mean()) assert model.score(X, y) == pytest.approx(r2_score(y, y_pred)) def test_pipeline_check_if_fitted(): class Estimator(BaseEstimator): def fit(self, X, y): self.fitted_ = True return self pipeline = Pipeline([("clf", Estimator())]) with pytest.raises(NotFittedError): check_is_fitted(pipeline) pipeline.fit(iris.data, iris.target) check_is_fitted(pipeline) def test_feature_union_check_if_fitted(): """Check __sklearn_is_fitted__ is defined correctly.""" X = [[1, 2], [3, 4], [5, 6]] y = [0, 1, 2] union = FeatureUnion([("clf", MinimalTransformer())]) with pytest.raises(NotFittedError): check_is_fitted(union) union.fit(X, y) check_is_fitted(union) # passthrough is stateless union = FeatureUnion([("pass", "passthrough")]) check_is_fitted(union) union = FeatureUnion([("clf", MinimalTransformer()), ("pass", "passthrough")]) with pytest.raises(NotFittedError): check_is_fitted(union) union.fit(X, y) check_is_fitted(union) def test_pipeline_get_feature_names_out_passes_names_through(): """Check that pipeline passes names through. Non-regresion test for #21349. """ X, y = iris.data, iris.target class AddPrefixStandardScalar(StandardScaler): def get_feature_names_out(self, input_features=None): names = super().get_feature_names_out(input_features=input_features) return np.asarray([f"my_prefix_{name}" for name in names], dtype=object) pipe = make_pipeline(AddPrefixStandardScalar(), StandardScaler()) pipe.fit(X, y) input_names = iris.feature_names feature_names_out = pipe.get_feature_names_out(input_names) assert_array_equal(feature_names_out, [f"my_prefix_{name}" for name in input_names]) def test_pipeline_set_output_integration(): """Test pipeline's set_output with feature names.""" pytest.importorskip("pandas") X, y = load_iris(as_frame=True, return_X_y=True) pipe = make_pipeline(StandardScaler(), LogisticRegression()) pipe.set_output(transform="pandas") pipe.fit(X, y) feature_names_in_ = pipe[:-1].get_feature_names_out() log_reg_feature_names = pipe[-1].feature_names_in_ assert_array_equal(feature_names_in_, log_reg_feature_names) def test_feature_union_set_output(): """Test feature union with set_output API.""" pd = pytest.importorskip("pandas") X, _ = load_iris(as_frame=True, return_X_y=True) X_train, X_test = train_test_split(X, random_state=0) union = FeatureUnion([("scalar", StandardScaler()), ("pca", PCA())]) union.set_output(transform="pandas") union.fit(X_train) X_trans = union.transform(X_test) assert isinstance(X_trans, pd.DataFrame) assert_array_equal(X_trans.columns, union.get_feature_names_out()) assert_array_equal(X_trans.index, X_test.index) def test_feature_union_getitem(): """Check FeatureUnion.__getitem__ returns expected results.""" scalar = StandardScaler() pca = PCA() union = FeatureUnion( [ ("scalar", scalar), ("pca", pca), ("pass", "passthrough"), ("drop_me", "drop"), ] ) assert union["scalar"] is scalar assert union["pca"] is pca assert union["pass"] == "passthrough" assert union["drop_me"] == "drop" @pytest.mark.parametrize("key", [0, slice(0, 2)]) def test_feature_union_getitem_error(key): """Raise error when __getitem__ gets a non-string input.""" union = FeatureUnion([("scalar", StandardScaler()), ("pca", 
PCA())]) msg = "Only string keys are supported" with pytest.raises(KeyError, match=msg): union[key] def test_feature_union_feature_names_in_(): """Ensure feature union has `.feature_names_in_` attribute if `X` has a `columns` attribute. Test for #24754. """ pytest.importorskip("pandas") X, _ = load_iris(as_frame=True, return_X_y=True) # FeatureUnion should have the feature_names_in_ attribute if the # first transformer also has it scaler = StandardScaler() scaler.fit(X) union = FeatureUnion([("scale", scaler)]) assert hasattr(union, "feature_names_in_") assert_array_equal(X.columns, union.feature_names_in_) assert_array_equal(scaler.feature_names_in_, union.feature_names_in_) # fit with pandas.DataFrame union = FeatureUnion([("pass", "passthrough")]) union.fit(X) assert hasattr(union, "feature_names_in_") assert_array_equal(X.columns, union.feature_names_in_) # fit with numpy array X_array = X.to_numpy() union = FeatureUnion([("pass", "passthrough")]) union.fit(X_array) assert not hasattr(union, "feature_names_in_") # transform_input tests # ===================== @config_context(enable_metadata_routing=True) @pytest.mark.parametrize("method", ["fit", "fit_transform"]) def test_transform_input_pipeline(method): """Test that with transform_input, data is correctly transformed for each step.""" def get_transformer(registry, sample_weight, metadata): """Get a transformer with requests set.""" return ( ConsumingTransformer(registry=registry) .set_fit_request(sample_weight=sample_weight, metadata=metadata) .set_transform_request(sample_weight=sample_weight, metadata=metadata) ) def get_pipeline(): """Get a pipeline and corresponding registries. The pipeline has 4 steps, with different request values set to test different cases. One is aliased. """ registry_1, registry_2, registry_3, registry_4 = ( _Registry(), _Registry(), _Registry(), _Registry(), ) pipe = make_pipeline( get_transformer(registry_1, sample_weight=True, metadata=True), get_transformer(registry_2, sample_weight=False, metadata=False), get_transformer(registry_3, sample_weight=True, metadata=True), get_transformer(registry_4, sample_weight="other_weights", metadata=True), transform_input=["sample_weight"], ) return pipe, registry_1, registry_2, registry_3, registry_4 def check_metadata(registry, methods, **metadata): """Check that the right metadata was recorded for the given methods.""" assert registry for estimator in registry: for method in methods: check_recorded_metadata( estimator, method=method, parent=method, **metadata, ) X = np.array([[1, 2], [3, 4]]) y = np.array([0, 1]) sample_weight = np.array([[1, 2]]) other_weights = np.array([[30, 40]]) metadata = np.array([[100, 200]]) pipe, registry_1, registry_2, registry_3, registry_4 = get_pipeline() pipe.fit( X, y, sample_weight=sample_weight, other_weights=other_weights, metadata=metadata, ) check_metadata( registry_1, ["fit", "transform"], sample_weight=sample_weight, metadata=metadata ) check_metadata(registry_2, ["fit", "transform"]) check_metadata( registry_3, ["fit", "transform"], sample_weight=sample_weight + 2, metadata=metadata, ) check_metadata( registry_4, method.split("_"), # ["fit", "transform"] if "fit_transform", ["fit"] otherwise sample_weight=other_weights + 3, metadata=metadata, ) @config_context(enable_metadata_routing=True) def test_transform_input_explicit_value_check(): """Test that the right transformed values are passed to `fit`.""" class Transformer(TransformerMixin, BaseEstimator): def fit(self, X, y): self.fitted_ = True return self def 

        def transform(self, X):
            return X + 1

    class Estimator(ClassifierMixin, BaseEstimator):
        def fit(self, X, y, X_val=None, y_val=None):
            assert_array_equal(X, np.array([[1, 2]]))
            assert_array_equal(y, np.array([0, 1]))
            assert_array_equal(X_val, np.array([[2, 3]]))
            assert_array_equal(y_val, np.array([0, 1]))
            return self

    X = np.array([[0, 1]])
    y = np.array([0, 1])
    X_val = np.array([[1, 2]])
    y_val = np.array([0, 1])
    pipe = Pipeline(
        [
            ("transformer", Transformer()),
            ("estimator", Estimator().set_fit_request(X_val=True, y_val=True)),
        ],
        transform_input=["X_val"],
    )
    pipe.fit(X, y, X_val=X_val, y_val=y_val)


def test_transform_input_no_slep6():
    """Make sure the right error is raised if SLEP6 is not enabled."""
    X = np.array([[1, 2], [3, 4]])
    y = np.array([0, 1])
    msg = "The `transform_input` parameter can only be set if metadata"
    with pytest.raises(ValueError, match=msg):
        make_pipeline(DummyTransf(), transform_input=["blah"]).fit(X, y)


@config_context(enable_metadata_routing=True)
def test_transform_tuple_input():
    """Test that if metadata is a tuple of arrays, both arrays are transformed."""

    class Estimator(ClassifierMixin, BaseEstimator):
        def fit(self, X, y, X_val=None, y_val=None):
            assert isinstance(X_val, tuple)
            assert isinstance(y_val, tuple)
            # Here we make sure that each X_val is transformed by the transformer
            assert_array_equal(X_val[0], np.array([[2, 3]]))
            assert_array_equal(y_val[0], np.array([0, 1]))
            assert_array_equal(X_val[1], np.array([[11, 12]]))
            assert_array_equal(y_val[1], np.array([1, 2]))
            self.fitted_ = True
            return self

    class Transformer(TransformerMixin, BaseEstimator):
        def fit(self, X, y):
            self.fitted_ = True
            return self

        def transform(self, X):
            return X + 1

    X = np.array([[1, 2]])
    y = np.array([0, 1])
    X_val0 = np.array([[1, 2]])
    y_val0 = np.array([0, 1])
    X_val1 = np.array([[10, 11]])
    y_val1 = np.array([1, 2])
    pipe = Pipeline(
        [
            ("transformer", Transformer()),
            ("estimator", Estimator().set_fit_request(X_val=True, y_val=True)),
        ],
        transform_input=["X_val"],
    )
    pipe.fit(X, y, X_val=(X_val0, X_val1), y_val=(y_val0, y_val1))


# end of transform_input tests
# =============================


# TODO(1.8): change warning to checking for NotFittedError
@pytest.mark.parametrize(
    "method",
    [
        "predict",
        "predict_proba",
        "predict_log_proba",
        "decision_function",
        "score",
        "score_samples",
        "transform",
        "inverse_transform",
    ],
)
def test_pipeline_warns_not_fitted(method):
    class StatelessEstimator(BaseEstimator):
        """Stateless estimator that doesn't check if it's fitted.

        Stateless estimators that don't require fit should properly set the
        `requires_fit` flag and implement a `__sklearn_is_fitted__` returning
        `True`.
        """

        def fit(self, X, y):
            return self  # pragma: no cover

        def transform(self, X):
            return X

        def predict(self, X):
            return np.ones(len(X))

        def predict_proba(self, X):
            return np.ones(len(X))

        def predict_log_proba(self, X):
            return np.zeros(len(X))

        def decision_function(self, X):
            return np.ones(len(X))

        def score(self, X, y):
            return 1

        def score_samples(self, X):
            return np.ones(len(X))

        def inverse_transform(self, X):
            return X

    pipe = Pipeline([("estimator", StatelessEstimator())])
    with pytest.warns(
        FutureWarning, match="This Pipeline instance is not fitted yet."
    ):
        getattr(pipe, method)([[1]])


# Test that metadata is routed correctly for pipelines and FeatureUnion
# =====================================================================
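# Editor's note (illustrative, not from the upstream suite): the tests below
# exercise the user-facing metadata-routing pattern, which roughly looks like
#
#     with config_context(enable_metadata_routing=True):
#         est = SomeConsumer().set_fit_request(sample_weight=True)
#         Pipeline([("est", est)]).fit(X, y, sample_weight=sample_weight)
#
# `SomeConsumer` is a placeholder. A consumer that accepts a passed metadata
# key but neither requests nor un-requests it triggers
# UnsetMetadataPassedError, which several of the error tests below rely on.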
class SimpleEstimator(BaseEstimator):
    # This class is used in this section for testing routing in the pipeline.
    # This class should have every set_{method}_request
    def __sklearn_is_fitted__(self):
        return True

    def fit(self, X, y, sample_weight=None, prop=None):
        assert sample_weight is not None, sample_weight
        assert prop is not None, prop
        return self

    def fit_transform(self, X, y, sample_weight=None, prop=None):
        assert sample_weight is not None
        assert prop is not None
        return X + 1

    def fit_predict(self, X, y, sample_weight=None, prop=None):
        assert sample_weight is not None
        assert prop is not None
        return np.ones(len(X))

    def predict(self, X, sample_weight=None, prop=None):
        assert sample_weight is not None
        assert prop is not None
        return np.ones(len(X))

    def predict_proba(self, X, sample_weight=None, prop=None):
        assert sample_weight is not None
        assert prop is not None
        return np.ones(len(X))

    def predict_log_proba(self, X, sample_weight=None, prop=None):
        assert sample_weight is not None
        assert prop is not None
        return np.zeros(len(X))

    def decision_function(self, X, sample_weight=None, prop=None):
        assert sample_weight is not None
        assert prop is not None
        return np.ones(len(X))

    def score(self, X, y, sample_weight=None, prop=None):
        assert sample_weight is not None
        assert prop is not None
        return 1

    def transform(self, X, sample_weight=None, prop=None):
        assert sample_weight is not None
        assert prop is not None
        return X + 1

    def inverse_transform(self, X, sample_weight=None, prop=None):
        assert sample_weight is not None
        assert prop is not None
        return X - 1


# split and partial_fit not relevant for pipelines
@pytest.mark.parametrize("method", sorted(set(METHODS) - {"split", "partial_fit"}))
@config_context(enable_metadata_routing=True)
def test_metadata_routing_for_pipeline(method):
    """Test that metadata is routed correctly for pipelines."""

    def set_request(est, method, **kwarg):
        """Set requests for a given method.

        If the given method is a composite method, set the same requests for
        all the methods that compose it.
        """
        if method in COMPOSITE_METHODS:
            methods = COMPOSITE_METHODS[method]
        else:
            methods = [method]

        for method in methods:
            getattr(est, f"set_{method}_request")(**kwarg)
        return est

    X, y = np.array([[1]]), np.array([1])
    sample_weight, prop, metadata = [1], "a", "b"

    # test that metadata is routed correctly for pipelines when requested
    est = SimpleEstimator()
    est = set_request(est, method, sample_weight=True, prop=True)
    est = set_request(est, "fit", sample_weight=True, prop=True)
    trs = (
        ConsumingTransformer()
        .set_fit_request(sample_weight=True, metadata=True)
        .set_transform_request(sample_weight=True, metadata=True)
        .set_inverse_transform_request(sample_weight=True, metadata=True)
    )
    pipeline = Pipeline([("trs", trs), ("estimator", est)])

    if "fit" not in method:
        pipeline = pipeline.fit(X, y, sample_weight=sample_weight, prop=prop)

    try:
        getattr(pipeline, method)(
            X, y, sample_weight=sample_weight, prop=prop, metadata=metadata
        )
    except TypeError:
        # Some methods don't accept y
        getattr(pipeline, method)(
            X, sample_weight=sample_weight, prop=prop, metadata=metadata
        )

    # Make sure the transformer has received the metadata
    # For the transformer, always only `fit` and `transform` are called.
    check_recorded_metadata(
        obj=trs,
        method="fit",
        parent="fit",
        sample_weight=sample_weight,
        metadata=metadata,
    )
    check_recorded_metadata(
        obj=trs,
        method="transform",
        parent="transform",
        sample_weight=sample_weight,
        metadata=metadata,
    )
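# Editor's note on the `set_request` helper above (hedged; the exact mapping
# lives in sklearn.utils._metadata_requests and may differ by version):
# COMPOSITE_METHODS maps a composite method name to the methods it is composed
# of, e.g. "fit_transform" -> ["fit", "transform"], so one composite request is
# translated into the matching set_fit_request(...) / set_transform_request(...)
# calls.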
@pytest.mark.parametrize("method", sorted(set(METHODS) - {"split", "partial_fit"})) @config_context(enable_metadata_routing=True) def test_metadata_routing_error_for_pipeline(method): """Test that metadata is not routed for pipelines when not requested.""" X, y = [[1]], [1] sample_weight, prop = [1], "a" est = SimpleEstimator() # here not setting sample_weight request and leaving it as None pipeline = Pipeline([("estimator", est)]) error_message = ( "[sample_weight, prop] are passed but are not explicitly set as requested" f" or not requested for SimpleEstimator.{method}" ) with pytest.raises(ValueError, match=re.escape(error_message)): try: # passing X, y positional as the first two arguments getattr(pipeline, method)(X, y, sample_weight=sample_weight, prop=prop) except TypeError: # not all methods accept y (like `predict`), so here we only # pass X as a positional arg. getattr(pipeline, method)(X, sample_weight=sample_weight, prop=prop) @pytest.mark.parametrize( "method", ["decision_function", "transform", "inverse_transform"] ) def test_routing_passed_metadata_not_supported(method): """Test that the right error message is raised when metadata is passed while not supported when `enable_metadata_routing=False`.""" pipe = Pipeline([("estimator", SimpleEstimator())]) with pytest.raises( ValueError, match="is only supported if enable_metadata_routing=True" ): getattr(pipe, method)([[1]], sample_weight=[1], prop="a") @config_context(enable_metadata_routing=True) def test_pipeline_with_estimator_with_len(): """Test that pipeline works with estimators that have a `__len__` method.""" pipe = Pipeline( [("trs", RandomTreesEmbedding()), ("estimator", RandomForestClassifier())] ) pipe.fit([[1]], [1]) pipe.predict([[1]]) @pytest.mark.parametrize("last_step", [None, "passthrough"]) @config_context(enable_metadata_routing=True) def test_pipeline_with_no_last_step(last_step): """Test that the pipeline works when there is not last step. It should just ignore and pass through the data on transform. 
""" pipe = Pipeline([("trs", FunctionTransformer()), ("estimator", last_step)]) assert pipe.fit([[1]], [1]).transform([[1], [2], [3]]) == [[1], [2], [3]] @config_context(enable_metadata_routing=True) def test_feature_union_metadata_routing_error(): """Test that the right error is raised when metadata is not requested.""" X = np.array([[0, 1], [2, 2], [4, 6]]) y = [1, 2, 3] sample_weight, metadata = [1, 1, 1], "a" # test lacking set_fit_request feature_union = FeatureUnion([("sub_transformer", ConsumingTransformer())]) error_message = ( "[sample_weight, metadata] are passed but are not explicitly set as requested" f" or not requested for {ConsumingTransformer.__name__}.fit" ) with pytest.raises(UnsetMetadataPassedError, match=re.escape(error_message)): feature_union.fit(X, y, sample_weight=sample_weight, metadata=metadata) # test lacking set_transform_request feature_union = FeatureUnion( [ ( "sub_transformer", ConsumingTransformer().set_fit_request( sample_weight=True, metadata=True ), ) ] ) error_message = ( "[sample_weight, metadata] are passed but are not explicitly set as requested " f"or not requested for {ConsumingTransformer.__name__}.transform" ) with pytest.raises(UnsetMetadataPassedError, match=re.escape(error_message)): feature_union.fit( X, y, sample_weight=sample_weight, metadata=metadata ).transform(X, sample_weight=sample_weight, metadata=metadata) @config_context(enable_metadata_routing=True) def test_feature_union_get_metadata_routing_without_fit(): """Test that get_metadata_routing() works regardless of the Child's consumption of any metadata.""" feature_union = FeatureUnion([("sub_transformer", ConsumingTransformer())]) feature_union.get_metadata_routing() @config_context(enable_metadata_routing=True) @pytest.mark.parametrize( "transformer", [ConsumingTransformer, ConsumingNoFitTransformTransformer] ) def test_feature_union_metadata_routing(transformer): """Test that metadata is routed correctly for FeatureUnion.""" X = np.array([[0, 1], [2, 2], [4, 6]]) y = [1, 2, 3] sample_weight, metadata = [1, 1, 1], "a" feature_union = FeatureUnion( [ ( "sub_trans1", transformer(registry=_Registry()) .set_fit_request(sample_weight=True, metadata=True) .set_transform_request(sample_weight=True, metadata=True), ), ( "sub_trans2", transformer(registry=_Registry()) .set_fit_request(sample_weight=True, metadata=True) .set_transform_request(sample_weight=True, metadata=True), ), ] ) kwargs = {"sample_weight": sample_weight, "metadata": metadata} feature_union.fit(X, y, **kwargs) feature_union.fit_transform(X, y, **kwargs) feature_union.fit(X, y, **kwargs).transform(X, **kwargs) for transformer in feature_union.transformer_list: # access sub-transformer in (name, trans) with transformer[1] registry = transformer[1].registry assert len(registry) for sub_trans in registry: check_recorded_metadata( obj=sub_trans, method="fit", parent="fit", **kwargs, ) # End of routing tests # ====================