merge conflicts

xadupre · xadupre · commit 8170301fc72b · 2025-07-24T17:35:31.000+02:00
Signed-off-by: xadupre <xadupre@microsoft.com>
diff --git a/CHANGELOGS.md b/CHANGELOGS.md
@@ -4,6 +4,8 @@
 
 * Fixes unknown_value=np.nan in OrdinalEncoder
   [#1198](https://github.com/onnx/sklearn-onnx/issues/1198)
+* Enhance OrdinalEncoder conversion to handle infrequent categories
+  [#1195](https://github.com/onnx/sklearn-onnx/issues/1195)
 
 ## 1.19.1
 
diff --git a/skl2onnx/operator_converters/ordinal_encoder.py b/skl2onnx/operator_converters/ordinal_encoder.py
@@ -46,6 +46,16 @@ def convert_sklearn_ordinal_encoder(
         if len(categories) == 0:
             continue
 
+        if (
+            hasattr(ordinal_op, "_infrequent_enabled")
+            and ordinal_op._infrequent_enabled
+        ):
+            default_to_infrequent_mappings = ordinal_op._default_to_infrequent_mappings[
+                input_idx
+            ]
+        else:
+            default_to_infrequent_mappings = None
+
         current_input = operator.inputs[input_idx]
         if current_input.get_second_dimension() == 1:
             feature_column = current_input
@@ -127,11 +137,28 @@ def convert_sklearn_ordinal_encoder(
             encoded_missing_value = np.array(
                 [int(ordinal_op.encoded_missing_value)]
             ).astype(dtype)
-            attrs[key] = np.concatenate(
-                (np.arange(len(categories) - 1).astype(dtype), encoded_missing_value)
-            )
+
+            # handle max_categories or min_frequency
+            if default_to_infrequent_mappings is not None:
+                attrs[key] = np.concatenate(
+                    (
+                        np.array(default_to_infrequent_mappings, dtype=dtype),
+                        encoded_missing_value,
+                    )
+                )
+            else:
+                attrs[key] = np.concatenate(
+                    (
+                        np.arange(len(categories) - 1).astype(dtype),
+                        encoded_missing_value,
+                    )
+                )
         else:
-            attrs[key] = np.arange(len(categories)).astype(dtype)
+            # handle max_categories or min_frequency
+            if default_to_infrequent_mappings is not None:
+                attrs[key] = np.array(default_to_infrequent_mappings, dtype=dtype)
+            else:
+                attrs[key] = np.arange(len(categories)).astype(dtype)
 
         if default_value or (
             isinstance(default_value, float) and np.isnan(default_value)
diff --git a/tests/test_sklearn_ordinal_encoder.py b/tests/test_sklearn_ordinal_encoder.py
@@ -40,6 +40,11 @@ def set_output_support():
     return pv.Version(vers) >= pv.Version("1.2")
 
 
+def max_categories_support():
+    vers = ".".join(sklearn_version.split(".")[:2])
+    return pv.Version(vers) >= pv.Version("1.3")
+
+
 class TestSklearnOrdinalEncoderConverter(unittest.TestCase):
     @unittest.skipIf(
         not ordinal_encoder_support(),
@@ -379,6 +384,86 @@ def test_ordinal_encoder_pipeline_string_int64(self):
         )
         assert_almost_equal(expected, got[0].ravel())
 
+    @unittest.skipIf(
+        not max_categories_support(),
+        reason="OrdinalEncoder supports max_categories and min_frequency since 1.3",
+    )
+    def test_model_ordinal_encoder_max_categories(self):
+        from onnxruntime import InferenceSession
+
+        model = OrdinalEncoder(max_categories=4)
+        data = np.array(
+            [["a"], ["b"], ["c"], ["d"], ["a"], ["b"], ["c"], ["e"]], dtype=np.object_
+        )
+
+        expected = model.fit_transform(data)
+
+        model_onnx = convert_sklearn(
+            model,
+            "scikit-learn ordinal encoder",
+            [("input", StringTensorType([None, 1]))],
+            target_opset=TARGET_OPSET,
+        )
+        self.assertIsNotNone(model_onnx)
+        dump_data_and_model(
+            data,
+            model,
+            model_onnx,
+            basename="SklearnOrdinalEncoderMaxCategories",
+        )
+
+        sess = InferenceSession(
+            model_onnx.SerializeToString(), providers=["CPUExecutionProvider"]
+        )
+        got = sess.run(
+            None,
+            {
+                "input": data,
+            },
+        )
+
+        assert_almost_equal(expected.reshape(-1), got[0].reshape(-1))
+
+    @unittest.skipIf(
+        not max_categories_support(),
+        reason="OrdinalEncoder supports max_categories and min_frequency since 1.3",
+    )
+    def test_model_ordinal_encoder_min_frequency(self):
+        from onnxruntime import InferenceSession
+
+        model = OrdinalEncoder(min_frequency=2)
+        data = np.array(
+            [["a"], ["b"], ["c"], ["d"], ["a"], ["b"], ["c"], ["e"]], dtype=np.object_
+        )
+
+        expected = model.fit_transform(data)
+
+        model_onnx = convert_sklearn(
+            model,
+            "scikit-learn ordinal encoder",
+            [("input", StringTensorType([None, 1]))],
+            target_opset=TARGET_OPSET,
+        )
+        self.assertIsNotNone(model_onnx)
+        dump_data_and_model(
+            data,
+            model,
+            model_onnx,
+            basename="SklearnOrdinalEncoderMinFrequency",
+        )
+
+        sess = InferenceSession(
+            model_onnx.SerializeToString(), providers=["CPUExecutionProvider"]
+        )
+        got = sess.run(
+            None,
+            {
+                "input": data,
+            },
+        )
+
+        assert_almost_equal(expected.reshape(-1), got[0].reshape(-1))
+
     @unittest.skipIf(
         not ordinal_encoder_support(),
         reason="OrdinalEncoder was not available before 0.20",