Failing test: tests/tests_automl/test_data_types.py::AutoMLDataTypesTest::test_category_data_type #750


Description

@a-szulc

The test fails outright: the only candidate model, 1_Default_CatBoost, errors during training (the underlying error is written to automl_tests/errors.md), and AutoML then raises AutoMLException: No models produced. Full pytest output:
============================= test session starts ==============================
platform linux -- Python 3.12.3, pytest-8.3.2, pluggy-1.5.0 -- /home/adas/mljar/mljar-supervised/venv/bin/python3
cachedir: .pytest_cache
rootdir: /home/adas/mljar/mljar-supervised
configfile: pytest.ini
plugins: cov-5.0.0
collecting ... collected 1 item

tests/tests_automl/test_data_types.py::AutoMLDataTypesTest::test_category_data_type AutoML directory: automl_tests
The task is binary_classification with evaluation metric logloss
AutoML will use algorithms: ['CatBoost']
AutoML steps: ['simple_algorithms', 'default_algorithms']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 1 model
There was an error during 1_Default_CatBoost training.
Please check automl_tests/errors.md for details.
FAILED

=================================== FAILURES ===================================
_________________ AutoMLDataTypesTest.test_category_data_type __________________

self = <tests.tests_automl.test_data_types.AutoMLDataTypesTest testMethod=test_category_data_type>

    def test_category_data_type(self):
        X = np.random.rand(self.rows, 3)
        X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)])
        y = np.random.randint(0, 2, self.rows)
    
        X["f1"] = X["f1"].astype("category")
    
        automl = AutoML(
            results_path=self.automl_dir,
            total_time_limit=1,
            algorithms=["CatBoost"],
            train_ensemble=False,
            explain_level=0,
            start_random_models=1,
        )
>       automl.fit(X, y)

tests/tests_automl/test_data_types.py:34: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
supervised/automl.py:432: in fit
    return self._fit(X, y, sample_weight, cv, sensitive_features)
supervised/base_automl.py:1237: in _fit
    raise e
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = AutoML(algorithms=['CatBoost'], explain_level=0, results_path='automl_tests',
       start_random_models=1, total_time_limit=1, train_ensemble=False)
X =            f0        f1        f2
0    0.862396  0.424942  0.392733
1    0.746183  0.040427  0.492273
2    0.934285  0....501938  0.523092  0.459691
248  0.854131  0.828239  0.431476
249  0.856862  0.055555  0.973876

[250 rows x 3 columns]
y = 0      1
1      1
2      0
3      0
4      1
      ..
245    1
246    0
247    0
248    1
249    0
Name: target, Length: 250, dtype: int64
sample_weight = None, cv = None, sensitive_features = None

    def _fit(self, X, y, sample_weight=None, cv=None, sensitive_features=None):
        """Fits the AutoML model with data"""
        if self._fit_level == "finished":
            print(
                "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'."
            )
            return
        # Validate input and build dataframes
        X, y, sample_weight, sensitive_features = self._build_dataframe(
            X, y, sample_weight, sensitive_features
        )
    
        self.n_rows_in_ = X.shape[0]
        self.n_features_in_ = X.shape[1]
        self.n_classes = len(np.unique(y[~pd.isnull(y)]))
    
        # Get attributes (__init__ params)
        self._mode = self._get_mode()
        self._ml_task = self._get_ml_task()
        self._results_path = self._get_results_path()
        self._total_time_limit = self._get_total_time_limit()
        self._model_time_limit = self._get_model_time_limit()
        self._algorithms = self._get_algorithms()
        self._train_ensemble = self._get_train_ensemble()
        self._stack_models = self._get_stack_models()
        self._eval_metric = self._get_eval_metric()
        self._validation_strategy = self._get_validation_strategy()
        self._verbose = self._get_verbose()
        self._explain_level = self._get_explain_level()
        self._golden_features = self._get_golden_features()
        self._features_selection = self._get_features_selection()
        self._start_random_models = self._get_start_random_models()
        self._hill_climbing_steps = self._get_hill_climbing_steps()
        self._top_models_to_improve = self._get_top_models_to_improve()
        self._boost_on_errors = self._get_boost_on_errors()
        self._kmeans_features = self._get_kmeans_features()
        self._mix_encoding = self._get_mix_encoding()
        self._max_single_prediction_time = self._get_max_single_prediction_time()
        self._optuna_time_budget = self._get_optuna_time_budget()
        self._optuna_init_params = self._get_optuna_init_params()
        self._optuna_verbose = self._get_optuna_verbose()
        self._n_jobs = self._get_n_jobs()
        self._random_state = self._get_random_state()
    
        if sensitive_features is not None:
            self._fairness_metric = self._get_fairness_metric()
            self._fairness_threshold = self._get_fairness_threshold()
            self._privileged_groups = self._get_privileged_groups()
            self._underprivileged_groups = self._get_underprivileged_groups()
    
        self._adjust_validation = False
        self._apply_constraints()
        if not self._adjust_validation:
            # if there is no validation adjustment
            # then we can apply stack_models constraints immediately
            # if there is validation adjustment
            # then we will apply constraints after the adjustment
            self._apply_constraints_stack_models()
    
        try:
            self.load_progress()
            if self._fit_level == "finished":
                print(
                    "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'."
                )
                return
            self._check_can_load()
    
            self.verbose_print(f"AutoML directory: {self._results_path}")
            if self._mode == "Optuna":
                ttl = int(len(self._algorithms) * self._optuna_time_budget)
                self.verbose_print("Expected computing time:")
                self.verbose_print(
                    f"Time for tuning with Optuna: len(algorithms) * optuna_time_budget = {int(len(self._algorithms) * self._optuna_time_budget)} seconds"
                )
                self.verbose_print(
                    f"There is no time limit for ML model training after Optuna tuning (total_time_limit parameter is ignored)."
                )
    
            self.verbose_print(
                f"The task is {self._ml_task} with evaluation metric {self._eval_metric}"
            )
            self.verbose_print(f"AutoML will use algorithms: {self._algorithms}")
            if self._stack_models:
                self.verbose_print("AutoML will stack models")
            if self._train_ensemble:
                self.verbose_print("AutoML will ensemble available models")
    
            self._start_time = time.time()
            if self._time_ctrl is not None:
                self._start_time -= self._time_ctrl.already_spend()
    
            # Automatic Exploratory Data Analysis
            # I disabled EDA because it won't be supported
            # I recommend using pandas_profiling or Sweetviz
            # if self._explain_level == 2:
            #     EDA.compute(X, y, os.path.join(self._results_path, "EDA"))
    
            # Save data
    
            self._save_data(
                X.copy(deep=False),
                y.copy(deep=False),
                None if sample_weight is None else sample_weight.copy(deep=False),
                cv,
                None
                if sensitive_features is None
                else sensitive_features.copy(deep=False),
            )
    
            tuner = MljarTuner(
                self._get_tuner_params(
                    self._start_random_models,
                    self._hill_climbing_steps,
                    self._top_models_to_improve,
                ),
                self._algorithms,
                self._ml_task,
                self._eval_metric,
                self._validation_strategy,
                self._explain_level,
                self._data_info,
                self._golden_features,
                self._features_selection,
                self._train_ensemble,
                self._stack_models,
                self._adjust_validation,
                self._boost_on_errors,
                self._kmeans_features,
                self._mix_encoding,
                self._optuna_time_budget,
                self._optuna_init_params,
                self._optuna_verbose,
                self._n_jobs,
                self._random_state,
                self._fairness_metric,
                self._fairness_threshold,
                self._privileged_groups,
                self._underprivileged_groups,
            )
            self.tuner = tuner
    
            steps = tuner.steps()
            self.verbose_print(
                f'AutoML steps: {[s for s in steps if "update_" not in s]}'
            )
            if self._time_ctrl is None:
                self._time_ctrl = TimeController(
                    self._start_time,
                    self._total_time_limit,
                    self._model_time_limit,
                    steps,
                    self._algorithms,
                )
    
            self._time_ctrl.log_time(
                "prepare_data",
                "prepare_data",
                "prepare_data",
                time.time() - self._start_time,
            )
    
            for step in steps:
                self._fit_level = step
                start = time.time()
                # self._time_start[step] = start
    
                if step in ["stack", "ensemble_stacked"] and not self._stack_models:
                    continue
    
                if step == "stack":
                    self.prepare_for_stacking()
                if "hill_climbing" in step or step in ["ensemble", "stack"]:
                    if len(self._models) == 0:
                        raise AutoMLException(
                            "No models produced. \nPlease check your data or"
                            " submit a Github issue at https://github.com/mljar/mljar-supervised/issues/new."
                        )
    
                generated_params = []
                if step in self._all_params:
                    generated_params = self._all_params[step]
                else:
                    generated_params = tuner.generate_params(
                        step,
                        self._models,
                        self._results_path,
                        self._stacked_models,
                        self._total_time_limit,
                    )
    
                if generated_params is None or not generated_params:
                    if "_update_" not in step:
                        self.verbose_print(
                            f"Skip {step} because no parameters were generated."
                        )
                    continue
                if generated_params:
                    if not self._time_ctrl.enough_time_for_step(self._fit_level):
                        self.verbose_print(f"Skip {step} because of the time limit.")
                        continue
                    else:
                        model_str = "models" if len(generated_params) > 1 else "model"
                        self.verbose_print(
                            f"* Step {step} will try to check up to {len(generated_params)} {model_str}"
                        )
    
                for params in generated_params:
                    if params.get("status", "") in ["trained", "skipped", "error"]:
                        self.verbose_print(f"{params['name']}: {params['status']}.")
                        continue
    
                    try:
                        trained = False
                        if "ensemble" in step:
                            trained = self.ensemble_step(
                                is_stacked=params["is_stacked"]
                            )
                        else:
                            trained = self.train_model(params)
                        params["status"] = "trained" if trained else "skipped"
                        params["final_loss"] = self._models[-1].get_final_loss()
                        params["train_time"] = self._models[-1].get_train_time()
    
                        if (
                            self._adjust_validation
                            and len(self._models) == 1
                            and step == "adjust_validation"
                        ):
                            self._set_adjusted_validation()
    
                    except NotTrainedException as e:
                        params["status"] = "error"
                        self.verbose_print(
                            params.get("name") + " not trained. " + str(e)
                        )
                    except Exception as e:
                        import traceback
    
                        self._update_errors_report(
                            params.get("name"), str(e) + "\n" + traceback.format_exc()
                        )
                        params["status"] = "error"
    
                    self.save_progress(step, generated_params)
    
            if not self._models:
>               raise AutoMLException("No models produced.")
E               supervised.exceptions.AutoMLException: No models produced.

supervised/base_automl.py:1206: AutoMLException
------------------------------ Captured log call -------------------------------
ERROR    supervised.exceptions:exceptions.py:15 No models produced.
=========================== short test summary info ============================
FAILED tests/tests_automl/test_data_types.py::AutoMLDataTypesTest::test_category_data_type
============================== 1 failed in 2.12s ===============================
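For anyone who wants to reproduce this outside pytest, here is a minimal standalone sketch assembled from the test body shown in the traceback. Two things are not visible in the snippet itself and are assumptions marked in comments: rows = 250 (inferred from the captured y, Length: 250) and the documented from supervised.automl import AutoML entry point.

```python
import numpy as np
import pandas as pd

from supervised.automl import AutoML  # documented mljar-supervised import

rows = 250  # assumption: inferred from the captured y (Length: 250, dtype: int64)
X = pd.DataFrame(np.random.rand(rows, 3), columns=[f"f{i}" for i in range(3)])
y = np.random.randint(0, 2, rows)

# The test's distinguishing step: cast a float column to pandas "category"
# dtype, giving CatBoost roughly 250 unique float-valued categories.
X["f1"] = X["f1"].astype("category")

automl = AutoML(
    results_path="automl_tests",  # same directory as the failing test
    total_time_limit=1,
    algorithms=["CatBoost"],
    train_ensemble=False,
    explain_level=0,
    start_random_models=1,
)
automl.fit(X, y)  # ends with AutoMLException: No models produced.
```

The pytest output does not include the actual CatBoost error; it only points at automl_tests/errors.md (written by _update_errors_report, visible in the trace above). A quick post-mortem after the failed run:

```python
# Print whatever AutoML appended to errors.md during the failed run;
# per _update_errors_report it holds the original exception plus traceback.
from pathlib import Path

report = Path("automl_tests") / "errors.md"
print(report.read_text() if report.exists() else "no errors.md written")
```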
