From edf1522691602f239b47a7ec968f7f68ac39e030 Mon Sep 17 00:00:00 2001 From: joshua Date: Fri, 5 Jun 2026 12:38:16 +0530 Subject: [PATCH 01/13] Fix XGBRegressor binary logistic conversion Signed-off-by: joshua --- .../xgboost/operator_converters/XGBoost.py | 27 ++++++-- tests/xgboost/test_xgboost_issues.py | 62 ++++++++++++++++++- 2 files changed, 84 insertions(+), 5 deletions(-) diff --git a/onnxmltools/convert/xgboost/operator_converters/XGBoost.py b/onnxmltools/convert/xgboost/operator_converters/XGBoost.py index cfe76b255..ecff0ef49 100644 --- a/onnxmltools/convert/xgboost/operator_converters/XGBoost.py +++ b/onnxmltools/convert/xgboost/operator_converters/XGBoost.py @@ -393,11 +393,19 @@ def convert(scope, operator, container): ) attr_pairs = XGBRegressorConverter._get_default_tree_attribute_pairs() - if isinstance(base_score, list): - attr_pairs["base_values"] = base_score - else: - attr_pairs["base_values"] = [base_score] + if objective == "binary:logistic": + bs_list = base_score if isinstance(base_score, list) else [base_score] + logit_base = [ + float(np.log(np.float32(bs) / (1.0 - np.float32(bs)))) + for bs in bs_list + ] + attr_pairs["base_values"] = logit_base + else: + if isinstance(base_score, list): + attr_pairs["base_values"] = base_score + else: + attr_pairs["base_values"] = [base_score] if best_ntree_limit and best_ntree_limit < len(js_trees): js_trees = js_trees[:best_ntree_limit] @@ -413,6 +421,10 @@ def convert(scope, operator, container): if objective in objectives_with_loglink: names = [scope.get_unique_variable_name("tree")] del attr_pairs["base_values"] + + elif objective == "binary:logistic": + names = [scope.get_unique_variable_name("tree")] + else: names = operator.output_full_names container.add_node( @@ -423,6 +435,13 @@ def convert(scope, operator, container): name=scope.get_unique_operator_name("TreeEnsembleRegressor"), **attr_pairs, ) + if objective == "binary:logistic": + container.add_node( + "Sigmoid", + names, + operator.output_full_names, + name=scope.get_unique_operator_name("Sigmoid"), + ) if objective in objectives_with_loglink: cst = scope.get_unique_variable_name("raw_prediction") diff --git a/tests/xgboost/test_xgboost_issues.py b/tests/xgboost/test_xgboost_issues.py index 0c989f266..bdbfc45f0 100644 --- a/tests/xgboost/test_xgboost_issues.py +++ b/tests/xgboost/test_xgboost_issues.py @@ -1,5 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 - import unittest try: @@ -57,7 +56,68 @@ def xgbregressor_shape_calculator(operator): ) got = sess.run(None, {"float_input": X.astype(np.float32)}) self.assertEqual(got[0].shape, (100, 2)) + @unittest.skipIf(XGBRegressor is None, "xgboost is not available") + def test_issue_726_binary_logistic_subsample(self): + import numpy as np + import pandas as pd + import onnxruntime as rt + + from onnxmltools.convert import convert_xgboost + from skl2onnx.common.data_types import FloatTensorType + + df = pd.DataFrame( + { + "f1": [1.0, 2.0, 3.0, 4.0, 2.0, 3.0, 1.0, 2.0], + "label": [1, 0, 1, 0, 1, 1, 0, 1], + } + ) + + params = { + "max_depth": 1, + "n_estimators": 3, + "subsample": 0.95, + "objective": "binary:logistic", + } + + model = XGBRegressor(**params) + + model.fit(df.drop(columns=["label"]), df["label"]) + initial_types = [ + ("f1", FloatTensorType([None, 1])), + ] + + onnx_model = convert_xgboost( + model, + "XGBoostXGBRegressor", + initial_types, + target_opset=13, + ) + + sess = rt.InferenceSession( + onnx_model.SerializeToString(), + providers=["CPUExecutionProvider"], + ) + + got = sess.run( + None, + { + "f1": df["f1"].values.reshape(-1, 1).astype(np.float32), + }, + )[0] + + expected = ( + model.predict(df.drop(columns=["label"])) + .reshape(-1, 1) + .astype(np.float32) + ) + + np.testing.assert_allclose( + got, + expected, + rtol=1e-5, + atol=1e-8, + ) if __name__ == "__main__": unittest.main() From 1dd5b993a19df05300d6acdcc6b8e52b75f5956a Mon Sep 17 00:00:00 2001 From: joshua Date: Tue, 9 Jun 2026 16:50:49 +0530 Subject: [PATCH 02/13] Fix XGBoost categorical test inputs Signed-off-by: joshua --- tests/xgboost/test_xgboost_converters.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/xgboost/test_xgboost_converters.py b/tests/xgboost/test_xgboost_converters.py index c6d9f4e45..babe224c5 100644 --- a/tests/xgboost/test_xgboost_converters.py +++ b/tests/xgboost/test_xgboost_converters.py @@ -931,10 +931,10 @@ def test_xgb_regressor_categorical_hist(self): # Build the ONNX input: # - first column: pandas category codes (0-based int codes) cast to float32 # - second column: numeric feature - # Note: X[["f0"]].values gives actual category values (e.g. 65, 66, 67), +# Note: X[["f0"]].values gives actual category values (e.g. 65, 66, 67), # but XGBoost stores category codes (0, 1, 2...) in its tree JSON dump, # so ONNX BRANCH_EQ nodes compare against codes, not raw values. - cat_codes = X["f0"].cat.codes.values.reshape(-1, 1).astype(np.float32) + cat_codes = X["f0"].cat.codes.to_numpy(dtype=np.float32).reshape(-1, 1) num_col = X[["f1"]].values.astype(np.float32) X_onnx = np.concatenate([cat_codes, num_col], axis=1) @@ -995,8 +995,8 @@ def test_xgb_regressor_categorical_hist_native(self): target_opset=TARGET_OPSET, ) - # Use pandas category codes (0, 1, 2...) not raw values (65, 66, 67...) - cat_codes = X["f0"].cat.codes.values.reshape(-1, 1).astype(np.float32) +# Use pandas category codes (0, 1, 2...) not raw values (65, 66, 67...) + cat_codes = X["f0"].cat.codes.to_numpy(dtype=np.float32).reshape(-1, 1) num_col = X[["f1"]].values.astype(np.float32) X_onnx = np.concatenate([cat_codes, num_col], axis=1) From 61870b7ea5dba27ee6b7530bdaf440a184ce3b93 Mon Sep 17 00:00:00 2001 From: joshua Date: Tue, 9 Jun 2026 22:48:09 +0530 Subject: [PATCH 03/13] Stabilize binary classifier post transforms Signed-off-by: joshua --- .../lightgbm/operator_converters/LightGbm.py | 5 +- .../xgboost/operator_converters/XGBoost.py | 89 +++++++++++++++---- tests/xgboost/test_xgboost_converters.py | 16 ++-- 3 files changed, 81 insertions(+), 29 deletions(-) diff --git a/onnxmltools/convert/lightgbm/operator_converters/LightGbm.py b/onnxmltools/convert/lightgbm/operator_converters/LightGbm.py index d03a50739..4c69f1fa5 100644 --- a/onnxmltools/convert/lightgbm/operator_converters/LightGbm.py +++ b/onnxmltools/convert/lightgbm/operator_converters/LightGbm.py @@ -553,10 +553,7 @@ def convert_lightgbm(scope, operator, container): if "objective" not in gbm_text: if "num_class" in gbm_text: n_classes = gbm_text["num_class"] - if n_classes == 1: - attrs["post_transform"] = "LOGISTIC" - else: - attrs["post_transform"] = "NONE" + attrs["post_transform"] = "NONE" objective = "binary" else: raise NotImplementedError( diff --git a/onnxmltools/convert/xgboost/operator_converters/XGBoost.py b/onnxmltools/convert/xgboost/operator_converters/XGBoost.py index ecff0ef49..3e4d37cfe 100644 --- a/onnxmltools/convert/xgboost/operator_converters/XGBoost.py +++ b/onnxmltools/convert/xgboost/operator_converters/XGBoost.py @@ -66,6 +66,14 @@ def common_members(xgb_node, inputs): js_trees = XGBConverter._process_categorical_features(js_trees) return objective, base_score, js_trees, best_ntree_limit + @staticmethod + def _base_score_to_margin(base_score): + bs_list = base_score if isinstance(base_score, list) else [base_score] + return [ + float(np.log(np.float32(bs) / (1.0 - np.float32(bs)))) + for bs in bs_list + ] + @staticmethod def _is_bracketed_json_list_string(s: str) -> bool: s = s.strip() @@ -395,12 +403,7 @@ def convert(scope, operator, container): attr_pairs = XGBRegressorConverter._get_default_tree_attribute_pairs() if objective == "binary:logistic": - bs_list = base_score if isinstance(base_score, list) else [base_score] - logit_base = [ - float(np.log(np.float32(bs) / (1.0 - np.float32(bs)))) - for bs in bs_list - ] - attr_pairs["base_values"] = logit_base + attr_pairs["base_values"] = XGBConverter._base_score_to_margin(base_score) else: if isinstance(base_score, list): attr_pairs["base_values"] = base_score @@ -508,15 +511,9 @@ def convert(scope, operator, container): # See https://github.com/dmlc/xgboost/blob/main/src/common/math.h#L23. attr_pairs["post_transform"] = "LOGISTIC" attr_pairs["class_ids"] = [0 for v in attr_pairs["class_treeids"]] - if js_trees[0].get("leaf", None) == 0: - attr_pairs["base_values"] = base_score - else: - # Transform base_score - for binary, use first element - bs_val = base_score[0] - if bs_val != 0.5: - # 0.5 -> cst = 0 - cst = -np.log(1 / np.float32(bs_val) - 1.0) - attr_pairs["base_values"] = [cst] + attr_pairs["base_values"] = XGBConverter._base_score_to_margin( + base_score + ) else: attr_pairs["base_values"] = base_score else: @@ -535,13 +532,73 @@ def convert(scope, operator, container): or np.issubdtype(classes.dtype, np.integer) or np.issubdtype(classes.dtype, np.bool_) ): + numeric_classes = True attr_pairs["classlabels_int64s"] = classes.astype("int") else: + numeric_classes = False classes = np.array([s.encode("utf-8") for s in classes]) attr_pairs["classlabels_strings"] = classes # add nodes - if objective in ("binary:logistic", "binary:hinge"): + if objective == "binary:logistic" and numeric_classes: + raw_attrs = XGBRegressorConverter._get_default_tree_attribute_pairs() + raw_attrs["base_values"] = XGBConverter._base_score_to_margin(base_score) + raw_attrs["n_targets"] = 1 + XGBConverter.fill_tree_attributes( + js_trees, raw_attrs, [1 for _ in js_trees], False + ) + + raw_score = scope.get_unique_variable_name("raw_score") + prob1 = scope.get_unique_variable_name("prob1") + prob0 = scope.get_unique_variable_name("prob0") + label_cond = scope.get_unique_variable_name("label_cond") + label_matrix = scope.get_unique_variable_name("label_matrix") + one = scope.get_unique_variable_name("one") + half = scope.get_unique_variable_name("half") + class0 = scope.get_unique_variable_name("class0") + class1 = scope.get_unique_variable_name("class1") + label_shape = scope.get_unique_variable_name("label_shape") + + container.add_node( + "TreeEnsembleRegressor", + operator.input_full_names, + [raw_score], + op_domain="ai.onnx.ml", + name=scope.get_unique_operator_name("TreeEnsembleRegressor"), + **raw_attrs, + ) + container.add_node( + "Sigmoid", + [raw_score], + [prob1], + name=scope.get_unique_operator_name("Sigmoid"), + ) + container.add_initializer(one, TensorProto.FLOAT, [1], [1.0]) + container.add_initializer(half, TensorProto.FLOAT, [1], [0.5]) + container.add_node("Sub", [one, prob1], [prob0]) + container.add_node( + "Concat", + [prob0, prob1], + [operator.output_full_names[1]], + axis=1, + ) + + class_labels = classes.astype("int64") + container.add_initializer( + class0, TensorProto.INT64, [1], [class_labels[0]] + ) + container.add_initializer( + class1, TensorProto.INT64, [1], [class_labels[1]] + ) + container.add_initializer(label_shape, TensorProto.INT64, [1], [-1]) + container.add_node("Greater", [prob1, half], [label_cond]) + container.add_node("Where", [label_cond, class1, class0], [label_matrix]) + container.add_node( + "Reshape", + [label_matrix, label_shape], + [operator.output_full_names[0]], + ) + elif objective in ("binary:logistic", "binary:hinge"): ncl = 2 if objective == "binary:hinge": attr_pairs["post_transform"] = "NONE" diff --git a/tests/xgboost/test_xgboost_converters.py b/tests/xgboost/test_xgboost_converters.py index babe224c5..6e6976d23 100644 --- a/tests/xgboost/test_xgboost_converters.py +++ b/tests/xgboost/test_xgboost_converters.py @@ -868,15 +868,13 @@ def test_xgb_classifier_13_2(self): initial_types = [("float_input", FloatTensorType([None, x_train.shape[1]]))] onnx_model = convert_xgboost(model, initial_types=initial_types) - for att in onnx_model.graph.node[0].attribute: - if att.name == "nodes_treeids": - self.assertLess(max(att.ints), 1000) - if att.name == "class_ids": - self.assertEqual(set(att.ints), {0}) - if att.name == "base_values": - self.assertEqual(len(att.floats), 1) - if att.name == "post_transform": - self.assertEqual(att.s, b"LOGISTIC") + tree_node = onnx_model.graph.node[0] + tree_attrs = {att.name: att for att in tree_node.attribute} + self.assertEqual(tree_node.op_type, "TreeEnsembleRegressor") + self.assertLess(max(tree_attrs["nodes_treeids"].ints), 1000) + self.assertEqual(len(tree_attrs["base_values"].floats), 1) + self.assertEqual(tree_attrs["post_transform"].s, b"NONE") + self.assertIn("Sigmoid", {node.op_type for node in onnx_model.graph.node}) expected = model.predict(x_test), model.predict_proba(x_test) sess = InferenceSession(onnx_model.SerializeToString()) From f0c4d1c9e2ce192dfaf9cfca723fa1ec2e2774a6 Mon Sep 17 00:00:00 2001 From: joshua Date: Wed, 10 Jun 2026 10:34:08 +0530 Subject: [PATCH 04/13] Fix XGBRegressor binary:logistic ONNX conversion and LightGBM post_transform regression Signed-off-by: joshua --- .../lightgbm/operator_converters/LightGbm.py | 5 +- .../xgboost/operator_converters/XGBoost.py | 139 ++++++------------ tests/xgboost/test_xgboost_issues.py | 72 ++++----- 3 files changed, 89 insertions(+), 127 deletions(-) diff --git a/onnxmltools/convert/lightgbm/operator_converters/LightGbm.py b/onnxmltools/convert/lightgbm/operator_converters/LightGbm.py index 4c69f1fa5..07dfeb87d 100644 --- a/onnxmltools/convert/lightgbm/operator_converters/LightGbm.py +++ b/onnxmltools/convert/lightgbm/operator_converters/LightGbm.py @@ -553,7 +553,10 @@ def convert_lightgbm(scope, operator, container): if "objective" not in gbm_text: if "num_class" in gbm_text: n_classes = gbm_text["num_class"] - attrs["post_transform"] = "NONE" + if n_classes == 1: + attrs["post_transform"] = "LOGISTIC" # binary → needs sigmoid + else: + attrs["post_transform"] = "NONE" # multiclass → already softmax elsewhere objective = "binary" else: raise NotImplementedError( diff --git a/onnxmltools/convert/xgboost/operator_converters/XGBoost.py b/onnxmltools/convert/xgboost/operator_converters/XGBoost.py index 3e4d37cfe..24b1c916f 100644 --- a/onnxmltools/convert/xgboost/operator_converters/XGBoost.py +++ b/onnxmltools/convert/xgboost/operator_converters/XGBoost.py @@ -66,14 +66,6 @@ def common_members(xgb_node, inputs): js_trees = XGBConverter._process_categorical_features(js_trees) return objective, base_score, js_trees, best_ntree_limit - @staticmethod - def _base_score_to_margin(base_score): - bs_list = base_score if isinstance(base_score, list) else [base_score] - return [ - float(np.log(np.float32(bs) / (1.0 - np.float32(bs)))) - for bs in bs_list - ] - @staticmethod def _is_bracketed_json_list_string(s: str) -> bool: s = s.strip() @@ -402,13 +394,11 @@ def convert(scope, operator, container): attr_pairs = XGBRegressorConverter._get_default_tree_attribute_pairs() - if objective == "binary:logistic": - attr_pairs["base_values"] = XGBConverter._base_score_to_margin(base_score) + if isinstance(base_score, list): + bs_list = base_score else: - if isinstance(base_score, list): - attr_pairs["base_values"] = base_score - else: - attr_pairs["base_values"] = [base_score] + bs_list = [base_score] + if best_ntree_limit and best_ntree_limit < len(js_trees): js_trees = js_trees[:best_ntree_limit] @@ -419,17 +409,45 @@ def convert(scope, operator, container): params = XGBConverter.get_xgb_params(xgb_node) attr_pairs["n_targets"] = params["n_targets"] + # binary:logistic: XGBoost accumulates tree outputs in logit space and + # applies sigmoid at the end. base_score is stored in probability space + # so convert it to logit space before passing to TreeEnsembleRegressor, + # then append an explicit Sigmoid node. + if objective == "binary:logistic": + bs_val = np.float32(bs_list[0]) + if bs_val == 0.5: + # logit(0.5) == 0, so omit base_values entirely + attr_pairs.pop("base_values", None) + else: + logit_bs = float(-np.log(1.0 / bs_val - 1.0)) + attr_pairs["base_values"] = [logit_bs] + + raw_name = scope.get_unique_variable_name("binary_logistic_raw") + container.add_node( + "TreeEnsembleRegressor", + operator.input_full_names, + [raw_name], + op_domain="ai.onnx.ml", + name=scope.get_unique_operator_name("TreeEnsembleRegressor"), + **attr_pairs, + ) + container.add_node( + "Sigmoid", + [raw_name], + operator.output_full_names, + name=scope.get_unique_operator_name("Sigmoid"), + ) + return + # add nodes objectives_with_loglink = {"count:poisson", "reg:gamma", "reg:tweedie"} if objective in objectives_with_loglink: names = [scope.get_unique_variable_name("tree")] del attr_pairs["base_values"] - - elif objective == "binary:logistic": - names = [scope.get_unique_variable_name("tree")] - else: + attr_pairs["base_values"] = bs_list names = operator.output_full_names + container.add_node( "TreeEnsembleRegressor", operator.input_full_names, @@ -438,18 +456,11 @@ def convert(scope, operator, container): name=scope.get_unique_operator_name("TreeEnsembleRegressor"), **attr_pairs, ) - if objective == "binary:logistic": - container.add_node( - "Sigmoid", - names, - operator.output_full_names, - name=scope.get_unique_operator_name("Sigmoid"), - ) if objective in objectives_with_loglink: cst = scope.get_unique_variable_name("raw_prediction") container.add_initializer( - cst, TensorProto.FLOAT, [len(base_score)], base_score + cst, TensorProto.FLOAT, [len(bs_list)], bs_list ) new_name = scope.get_unique_variable_name("exp") container.add_node("Exp", names, [new_name]) @@ -511,9 +522,15 @@ def convert(scope, operator, container): # See https://github.com/dmlc/xgboost/blob/main/src/common/math.h#L23. attr_pairs["post_transform"] = "LOGISTIC" attr_pairs["class_ids"] = [0 for v in attr_pairs["class_treeids"]] - attr_pairs["base_values"] = XGBConverter._base_score_to_margin( - base_score - ) + if js_trees[0].get("leaf", None) == 0: + attr_pairs["base_values"] = base_score + else: + # Transform base_score - for binary, use first element + bs_val = base_score[0] + if bs_val != 0.5: + # 0.5 -> cst = 0 + cst = -np.log(1 / np.float32(bs_val) - 1.0) + attr_pairs["base_values"] = [cst] else: attr_pairs["base_values"] = base_score else: @@ -532,73 +549,13 @@ def convert(scope, operator, container): or np.issubdtype(classes.dtype, np.integer) or np.issubdtype(classes.dtype, np.bool_) ): - numeric_classes = True attr_pairs["classlabels_int64s"] = classes.astype("int") else: - numeric_classes = False classes = np.array([s.encode("utf-8") for s in classes]) attr_pairs["classlabels_strings"] = classes # add nodes - if objective == "binary:logistic" and numeric_classes: - raw_attrs = XGBRegressorConverter._get_default_tree_attribute_pairs() - raw_attrs["base_values"] = XGBConverter._base_score_to_margin(base_score) - raw_attrs["n_targets"] = 1 - XGBConverter.fill_tree_attributes( - js_trees, raw_attrs, [1 for _ in js_trees], False - ) - - raw_score = scope.get_unique_variable_name("raw_score") - prob1 = scope.get_unique_variable_name("prob1") - prob0 = scope.get_unique_variable_name("prob0") - label_cond = scope.get_unique_variable_name("label_cond") - label_matrix = scope.get_unique_variable_name("label_matrix") - one = scope.get_unique_variable_name("one") - half = scope.get_unique_variable_name("half") - class0 = scope.get_unique_variable_name("class0") - class1 = scope.get_unique_variable_name("class1") - label_shape = scope.get_unique_variable_name("label_shape") - - container.add_node( - "TreeEnsembleRegressor", - operator.input_full_names, - [raw_score], - op_domain="ai.onnx.ml", - name=scope.get_unique_operator_name("TreeEnsembleRegressor"), - **raw_attrs, - ) - container.add_node( - "Sigmoid", - [raw_score], - [prob1], - name=scope.get_unique_operator_name("Sigmoid"), - ) - container.add_initializer(one, TensorProto.FLOAT, [1], [1.0]) - container.add_initializer(half, TensorProto.FLOAT, [1], [0.5]) - container.add_node("Sub", [one, prob1], [prob0]) - container.add_node( - "Concat", - [prob0, prob1], - [operator.output_full_names[1]], - axis=1, - ) - - class_labels = classes.astype("int64") - container.add_initializer( - class0, TensorProto.INT64, [1], [class_labels[0]] - ) - container.add_initializer( - class1, TensorProto.INT64, [1], [class_labels[1]] - ) - container.add_initializer(label_shape, TensorProto.INT64, [1], [-1]) - container.add_node("Greater", [prob1, half], [label_cond]) - container.add_node("Where", [label_cond, class1, class0], [label_matrix]) - container.add_node( - "Reshape", - [label_matrix, label_shape], - [operator.output_full_names[0]], - ) - elif objective in ("binary:logistic", "binary:hinge"): + if objective in ("binary:logistic", "binary:hinge"): ncl = 2 if objective == "binary:hinge": attr_pairs["post_transform"] = "NONE" @@ -685,4 +642,4 @@ def convert_xgboost(scope, operator, container): register_converter("XGBClassifier", convert_xgboost) register_converter("XGBRFClassifier", convert_xgboost) register_converter("XGBRegressor", convert_xgboost) -register_converter("XGBRFRegressor", convert_xgboost) +register_converter("XGBRFRegressor", convert_xgboost) \ No newline at end of file diff --git a/tests/xgboost/test_xgboost_issues.py b/tests/xgboost/test_xgboost_issues.py index bdbfc45f0..340863434 100644 --- a/tests/xgboost/test_xgboost_issues.py +++ b/tests/xgboost/test_xgboost_issues.py @@ -56,42 +56,49 @@ def xgbregressor_shape_calculator(operator): ) got = sess.run(None, {"float_input": X.astype(np.float32)}) self.assertEqual(got[0].shape, (100, 2)) + @unittest.skipIf(XGBRegressor is None, "xgboost is not available") def test_issue_726_binary_logistic_subsample(self): import numpy as np - import pandas as pd import onnxruntime as rt - - from onnxmltools.convert import convert_xgboost + from skl2onnx import convert_sklearn, update_registered_converter from skl2onnx.common.data_types import FloatTensorType - - df = pd.DataFrame( - { - "f1": [1.0, 2.0, 3.0, 4.0, 2.0, 3.0, 1.0, 2.0], - "label": [1, 0, 1, 0, 1, 1, 0, 1], - } + from skl2onnx.common.shape_calculator import ( + calculate_linear_regressor_output_shapes, + ) + from onnxmltools.convert.xgboost.operator_converters.XGBoost import ( + convert_xgboost, ) - params = { - "max_depth": 1, - "n_estimators": 3, - "subsample": 0.95, - "objective": "binary:logistic", - } + update_registered_converter( + XGBRegressor, + "XGBoostXGBRegressor", + calculate_linear_regressor_output_shapes, + convert_xgboost, + overwrite_existing=True, + ) - model = XGBRegressor(**params) + X = np.array( + [[1.0], [2.0], [3.0], [4.0], [2.0], [3.0], [1.0], [2.0]], + dtype=np.float32, + ) + y = np.array([1, 0, 1, 0, 1, 1, 0, 1], dtype=np.float32) - model.fit(df.drop(columns=["label"]), df["label"]) + model = XGBRegressor( + max_depth=1, + n_estimators=3, + subsample=0.95, + objective="binary:logistic", + ) + model.fit(X, y) - initial_types = [ - ("f1", FloatTensorType([None, 1])), - ] + initial_types = [("f1", FloatTensorType([None, 1]))] - onnx_model = convert_xgboost( + onnx_model = convert_sklearn( model, "XGBoostXGBRegressor", initial_types, - target_opset=13, + target_opset={"": 13, "ai.onnx.ml": 3}, ) sess = rt.InferenceSession( @@ -99,25 +106,20 @@ def test_issue_726_binary_logistic_subsample(self): providers=["CPUExecutionProvider"], ) - got = sess.run( - None, - { - "f1": df["f1"].values.reshape(-1, 1).astype(np.float32), - }, - )[0] - - expected = ( - model.predict(df.drop(columns=["label"])) - .reshape(-1, 1) - .astype(np.float32) - ) + got = sess.run(None, {"f1": X})[0] + expected = model.predict(X).reshape(-1, 1).astype(np.float32) np.testing.assert_allclose( got, expected, rtol=1e-5, atol=1e-8, + err_msg=( + f"\nExpected: {expected.flatten()}" + f"\nONNX: {got.flatten()}" + ), ) + if __name__ == "__main__": - unittest.main() + unittest.main() \ No newline at end of file From d4b3e19a560bdfffd96d7b067ff8ddc91469476b Mon Sep 17 00:00:00 2001 From: joshua Date: Wed, 10 Jun 2026 22:57:19 +0530 Subject: [PATCH 05/13] Fix Copilot review comments: pop base_values safely, fix classifier test assertions, add random_state to regression test Signed-off-by: joshua --- .../convert/xgboost/operator_converters/XGBoost.py | 2 +- tests/xgboost/test_xgboost_converters.py | 13 ++++++++----- tests/xgboost/test_xgboost_issues.py | 1 + 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/onnxmltools/convert/xgboost/operator_converters/XGBoost.py b/onnxmltools/convert/xgboost/operator_converters/XGBoost.py index 24b1c916f..6447163ed 100644 --- a/onnxmltools/convert/xgboost/operator_converters/XGBoost.py +++ b/onnxmltools/convert/xgboost/operator_converters/XGBoost.py @@ -443,7 +443,7 @@ def convert(scope, operator, container): objectives_with_loglink = {"count:poisson", "reg:gamma", "reg:tweedie"} if objective in objectives_with_loglink: names = [scope.get_unique_variable_name("tree")] - del attr_pairs["base_values"] + attr_pairs.pop("base_values", None) else: attr_pairs["base_values"] = bs_list names = operator.output_full_names diff --git a/tests/xgboost/test_xgboost_converters.py b/tests/xgboost/test_xgboost_converters.py index 6e6976d23..137bdf5f9 100644 --- a/tests/xgboost/test_xgboost_converters.py +++ b/tests/xgboost/test_xgboost_converters.py @@ -868,13 +868,16 @@ def test_xgb_classifier_13_2(self): initial_types = [("float_input", FloatTensorType([None, x_train.shape[1]]))] onnx_model = convert_xgboost(model, initial_types=initial_types) - tree_node = onnx_model.graph.node[0] + tree_node = next( + node + for node in onnx_model.graph.node + if node.op_type == "TreeEnsembleClassifier" + ) tree_attrs = {att.name: att for att in tree_node.attribute} - self.assertEqual(tree_node.op_type, "TreeEnsembleRegressor") self.assertLess(max(tree_attrs["nodes_treeids"].ints), 1000) - self.assertEqual(len(tree_attrs["base_values"].floats), 1) - self.assertEqual(tree_attrs["post_transform"].s, b"NONE") - self.assertIn("Sigmoid", {node.op_type for node in onnx_model.graph.node}) + if "base_values" in tree_attrs: + self.assertEqual(len(tree_attrs["base_values"].floats), 1) + self.assertEqual(tree_attrs["post_transform"].s, b"LOGISTIC") expected = model.predict(x_test), model.predict_proba(x_test) sess = InferenceSession(onnx_model.SerializeToString()) diff --git a/tests/xgboost/test_xgboost_issues.py b/tests/xgboost/test_xgboost_issues.py index 340863434..617cae7ba 100644 --- a/tests/xgboost/test_xgboost_issues.py +++ b/tests/xgboost/test_xgboost_issues.py @@ -89,6 +89,7 @@ def test_issue_726_binary_logistic_subsample(self): n_estimators=3, subsample=0.95, objective="binary:logistic", + random_state=0, ) model.fit(X, y) From f098d2c33f2ae8288b0749cf11e12bc081a0bde1 Mon Sep 17 00:00:00 2001 From: joshua Date: Wed, 10 Jun 2026 23:36:30 +0530 Subject: [PATCH 06/13] Fix base_score validation and comment indentation Signed-off-by: joshua --- .../convert/xgboost/operator_converters/XGBoost.py | 11 +++++++++-- tests/xgboost/test_xgboost_converters.py | 4 ++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/onnxmltools/convert/xgboost/operator_converters/XGBoost.py b/onnxmltools/convert/xgboost/operator_converters/XGBoost.py index 6447163ed..6270a8286 100644 --- a/onnxmltools/convert/xgboost/operator_converters/XGBoost.py +++ b/onnxmltools/convert/xgboost/operator_converters/XGBoost.py @@ -415,11 +415,18 @@ def convert(scope, operator, container): # then append an explicit Sigmoid node. if objective == "binary:logistic": bs_val = np.float32(bs_list[0]) - if bs_val == 0.5: + if not (0.0 < bs_val < 1.0): + raise ValueError( + f"base_score={bs_val} is out of range for binary:logistic; " + "expected a probability in (0, 1)." + ) + if np.isclose(bs_val, 0.5): # logit(0.5) == 0, so omit base_values entirely attr_pairs.pop("base_values", None) else: - logit_bs = float(-np.log(1.0 / bs_val - 1.0)) + # Clip away from 0/1 for numerical stability before computing logit + bs_clipped = np.clip(bs_val, 1e-7, 1.0 - 1e-7) + logit_bs = float(-np.log(1.0 / bs_clipped - 1.0)) attr_pairs["base_values"] = [logit_bs] raw_name = scope.get_unique_variable_name("binary_logistic_raw") diff --git a/tests/xgboost/test_xgboost_converters.py b/tests/xgboost/test_xgboost_converters.py index 137bdf5f9..740832de5 100644 --- a/tests/xgboost/test_xgboost_converters.py +++ b/tests/xgboost/test_xgboost_converters.py @@ -932,7 +932,7 @@ def test_xgb_regressor_categorical_hist(self): # Build the ONNX input: # - first column: pandas category codes (0-based int codes) cast to float32 # - second column: numeric feature -# Note: X[["f0"]].values gives actual category values (e.g. 65, 66, 67), + # Note: X[["f0"]].values gives actual category values (e.g. 65, 66, 67), # but XGBoost stores category codes (0, 1, 2...) in its tree JSON dump, # so ONNX BRANCH_EQ nodes compare against codes, not raw values. cat_codes = X["f0"].cat.codes.to_numpy(dtype=np.float32).reshape(-1, 1) @@ -996,7 +996,7 @@ def test_xgb_regressor_categorical_hist_native(self): target_opset=TARGET_OPSET, ) -# Use pandas category codes (0, 1, 2...) not raw values (65, 66, 67...) + # Use pandas category codes (0, 1, 2...) not raw values (65, 66, 67...) cat_codes = X["f0"].cat.codes.to_numpy(dtype=np.float32).reshape(-1, 1) num_col = X[["f1"]].values.astype(np.float32) X_onnx = np.concatenate([cat_codes, num_col], axis=1) From cbeaa90b446320f1f5bdec70bac6043284a194a6 Mon Sep 17 00:00:00 2001 From: joshua Date: Thu, 11 Jun 2026 00:00:05 +0530 Subject: [PATCH 07/13] Fix base_score logit transform for binary classifier and drop overwrite_existing kwarg Signed-off-by: joshua --- .../xgboost/operator_converters/XGBoost.py | 51 ++++++++++++++----- tests/xgboost/test_xgboost_issues.py | 3 +- 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/onnxmltools/convert/xgboost/operator_converters/XGBoost.py b/onnxmltools/convert/xgboost/operator_converters/XGBoost.py index 6270a8286..6320208a0 100644 --- a/onnxmltools/convert/xgboost/operator_converters/XGBoost.py +++ b/onnxmltools/convert/xgboost/operator_converters/XGBoost.py @@ -364,6 +364,18 @@ def fill_tree_attributes(js_xgb_node, attr_pairs, tree_weights, is_classifier): ) +def _compute_base_score_logit(base_score): + """ + Convert a base_score probability value to logit space. + Returns (logit_value, is_zero) where is_zero=True means logit is 0 + (i.e. base_score == 0.5) and no base_values entry is needed. + """ + bs_val = np.float32(base_score) + bs_clipped = np.clip(bs_val, 1e-7, 1.0 - 1e-7) + logit_bs = float(-np.log(1.0 / bs_clipped - 1.0)) + return logit_bs, np.isclose(logit_bs, 0.0) + + class XGBRegressorConverter(XGBConverter): """ Converter for XGBoost Regressor models to ONNX format. @@ -420,13 +432,11 @@ def convert(scope, operator, container): f"base_score={bs_val} is out of range for binary:logistic; " "expected a probability in (0, 1)." ) - if np.isclose(bs_val, 0.5): + logit_bs, is_zero = _compute_base_score_logit(bs_val) + if is_zero: # logit(0.5) == 0, so omit base_values entirely attr_pairs.pop("base_values", None) else: - # Clip away from 0/1 for numerical stability before computing logit - bs_clipped = np.clip(bs_val, 1e-7, 1.0 - 1e-7) - logit_bs = float(-np.log(1.0 / bs_clipped - 1.0)) attr_pairs["base_values"] = [logit_bs] raw_name = scope.get_unique_variable_name("binary_logistic_raw") @@ -487,6 +497,16 @@ def _get_default_tree_attribute_pairs(): # attrs['nodes_hitrates'] = [] return attrs + @staticmethod + def _all_trees_are_stumps(js_trees): + """ + Returns True if every tree in js_trees is a single root-level leaf + (i.e. the model is degenerate / learned nothing from the data). + XGBoost >=2 can produce these when early stopping fires on round 0 + or when gamma/min_child_weight constraints prune every split. + """ + return all("leaf" in t and "children" not in t for t in js_trees) + @staticmethod def convert(scope, operator, container): xgb_node = operator.raw_operator @@ -529,15 +549,22 @@ def convert(scope, operator, container): # See https://github.com/dmlc/xgboost/blob/main/src/common/math.h#L23. attr_pairs["post_transform"] = "LOGISTIC" attr_pairs["class_ids"] = [0 for v in attr_pairs["class_treeids"]] - if js_trees[0].get("leaf", None) == 0: - attr_pairs["base_values"] = base_score + + # Always apply the logit transform to base_score for binary + # classifiers. XGBoost >=2 stores base_score in probability + # space and accumulates tree outputs in logit space, so the + # base offset fed into TreeEnsembleClassifier must also be in + # logit space. The previous code skipped the transform when + # all trees were stumps (leaf==0), which caused ONNX to use + # the raw probability as a logit offset and produced wrong + # probabilities for degenerate / early-stopped models. + bs_val = base_score[0] + logit_bs, is_zero = _compute_base_score_logit(bs_val) + if is_zero: + # logit(0.5) == 0 → no offset needed, omit base_values + attr_pairs.pop("base_values", None) else: - # Transform base_score - for binary, use first element - bs_val = base_score[0] - if bs_val != 0.5: - # 0.5 -> cst = 0 - cst = -np.log(1 / np.float32(bs_val) - 1.0) - attr_pairs["base_values"] = [cst] + attr_pairs["base_values"] = [logit_bs] else: attr_pairs["base_values"] = base_score else: diff --git a/tests/xgboost/test_xgboost_issues.py b/tests/xgboost/test_xgboost_issues.py index 617cae7ba..49851d3c8 100644 --- a/tests/xgboost/test_xgboost_issues.py +++ b/tests/xgboost/test_xgboost_issues.py @@ -70,12 +70,13 @@ def test_issue_726_binary_logistic_subsample(self): convert_xgboost, ) + # overwrite_existing was removed in skl2onnx >=1.18; the default + # behaviour is already to overwrite, so simply drop the kwarg. update_registered_converter( XGBRegressor, "XGBoostXGBRegressor", calculate_linear_regressor_output_shapes, convert_xgboost, - overwrite_existing=True, ) X = np.array( From 0e6ba8a04a5a1da798c5e15c829e2880892a644a Mon Sep 17 00:00:00 2001 From: joshua Date: Thu, 11 Jun 2026 00:28:03 +0530 Subject: [PATCH 08/13] Fix base_score logit transform: gate on XGBoost version, drop overwrite_existing kwarg Signed-off-by: joshua --- .../xgboost/operator_converters/XGBoost.py | 138 +++++++++++------- 1 file changed, 89 insertions(+), 49 deletions(-) diff --git a/onnxmltools/convert/xgboost/operator_converters/XGBoost.py b/onnxmltools/convert/xgboost/operator_converters/XGBoost.py index 6320208a0..867cd838a 100644 --- a/onnxmltools/convert/xgboost/operator_converters/XGBoost.py +++ b/onnxmltools/convert/xgboost/operator_converters/XGBoost.py @@ -56,15 +56,36 @@ def common_members(xgb_node, inputs): best_ntree_limit = xgb_node.best_iteration + 1 else: best_ntree_limit = params.get("best_ntree_limit", None) - if base_score is None: - base_score = [0.5] + + # Detect whether base_score came from the model config (XGBoost >=2) + # or from raw sklearn params (XGBoost <2). + # + # XGBoost >=2: get_xgb_params() sets base_score to a list (e.g. [0.5]) + # read from save_config(). The value is in *probability* space, and + # XGBoost accumulates tree outputs in *logit* space, so we must convert + # base_score to logit space before passing it to the ONNX operator. + # + # XGBoost <2: base_score is a plain float coming directly from + # get_xgb_params() / __dict__. XGBoost <2 bakes the base_score offset + # into the tree leaf values at training time, so the raw float should + # be passed through unchanged (no logit transform). + if isinstance(base_score, list): + base_score_needs_logit = True + else: + # Normalise to list for uniform downstream handling + base_score_needs_logit = False + if base_score is None: + base_score = [0.5] + else: + base_score = [float(base_score)] + booster = xgb_node.get_booster() # The json format was available in October 2017. # XGBoost 0.7 was the first version released with it. js_tree_list = booster.get_dump(with_stats=True, dump_format="json") js_trees: TreeLike = [json.loads(s) for s in js_tree_list] js_trees = XGBConverter._process_categorical_features(js_trees) - return objective, base_score, js_trees, best_ntree_limit + return objective, base_score, js_trees, best_ntree_limit, base_score_needs_logit @staticmethod def _is_bracketed_json_list_string(s: str) -> bool: @@ -400,20 +421,21 @@ def _get_default_tree_attribute_pairs(): def convert(scope, operator, container): xgb_node = operator.raw_operator inputs = operator.inputs - objective, base_score, js_trees, best_ntree_limit = XGBConverter.common_members( - xgb_node, inputs - ) - - attr_pairs = XGBRegressorConverter._get_default_tree_attribute_pairs() + ( + objective, + base_score, + js_trees, + best_ntree_limit, + base_score_needs_logit, + ) = XGBConverter.common_members(xgb_node, inputs) - if isinstance(base_score, list): - bs_list = base_score - else: - bs_list = [base_score] + # base_score is always a list at this point (normalised in common_members) + bs_list = base_score if best_ntree_limit and best_ntree_limit < len(js_trees): js_trees = js_trees[:best_ntree_limit] + attr_pairs = XGBRegressorConverter._get_default_tree_attribute_pairs() XGBConverter.fill_tree_attributes( js_trees, attr_pairs, [1 for _ in js_trees], False ) @@ -421,23 +443,26 @@ def convert(scope, operator, container): params = XGBConverter.get_xgb_params(xgb_node) attr_pairs["n_targets"] = params["n_targets"] - # binary:logistic: XGBoost accumulates tree outputs in logit space and - # applies sigmoid at the end. base_score is stored in probability space - # so convert it to logit space before passing to TreeEnsembleRegressor, - # then append an explicit Sigmoid node. + # binary:logistic: XGBoost >=2 stores base_score in probability space + # and accumulates tree outputs in logit space, so we convert base_score + # to logit space and append an explicit Sigmoid node. + # XGBoost <2 bakes base_score into leaf values, so pass it through as-is. if objective == "binary:logistic": - bs_val = np.float32(bs_list[0]) - if not (0.0 < bs_val < 1.0): - raise ValueError( - f"base_score={bs_val} is out of range for binary:logistic; " - "expected a probability in (0, 1)." - ) - logit_bs, is_zero = _compute_base_score_logit(bs_val) - if is_zero: - # logit(0.5) == 0, so omit base_values entirely - attr_pairs.pop("base_values", None) + if base_score_needs_logit: + bs_val = np.float32(bs_list[0]) + if not (0.0 < bs_val < 1.0): + raise ValueError( + f"base_score={bs_val} is out of range for binary:logistic; " + "expected a probability in (0, 1)." + ) + logit_bs, is_zero = _compute_base_score_logit(bs_val) + if is_zero: + attr_pairs.pop("base_values", None) + else: + attr_pairs["base_values"] = [logit_bs] else: - attr_pairs["base_values"] = [logit_bs] + # XGBoost <2: base_score already accounted for in leaf values + attr_pairs.pop("base_values", None) raw_name = scope.get_unique_variable_name("binary_logistic_raw") container.add_node( @@ -512,9 +537,15 @@ def convert(scope, operator, container): xgb_node = operator.raw_operator inputs = operator.inputs - objective, base_score, js_trees, best_ntree_limit = XGBConverter.common_members( - xgb_node, inputs - ) + ( + objective, + base_score, + js_trees, + best_ntree_limit, + base_score_needs_logit, + ) = XGBConverter.common_members(xgb_node, inputs) + + # base_score is always a list at this point (normalised in common_members) params = XGBConverter.get_xgb_params(xgb_node) n_estimators = get_n_estimators_classifier(xgb_node, params, js_trees) @@ -550,31 +581,40 @@ def convert(scope, operator, container): attr_pairs["post_transform"] = "LOGISTIC" attr_pairs["class_ids"] = [0 for v in attr_pairs["class_treeids"]] - # Always apply the logit transform to base_score for binary - # classifiers. XGBoost >=2 stores base_score in probability - # space and accumulates tree outputs in logit space, so the - # base offset fed into TreeEnsembleClassifier must also be in - # logit space. The previous code skipped the transform when - # all trees were stumps (leaf==0), which caused ONNX to use - # the raw probability as a logit offset and produced wrong - # probabilities for degenerate / early-stopped models. - bs_val = base_score[0] - logit_bs, is_zero = _compute_base_score_logit(bs_val) - if is_zero: - # logit(0.5) == 0 → no offset needed, omit base_values - attr_pairs.pop("base_values", None) + # XGBoost >=2 stores base_score in probability space and + # accumulates tree outputs in logit space, so convert it to + # logit space before passing to TreeEnsembleClassifier. + # XGBoost <2 bakes base_score into the leaf values at training + # time, so no transform is needed — just omit base_values. + if base_score_needs_logit: + bs_val = base_score[0] + logit_bs, is_zero = _compute_base_score_logit(bs_val) + if is_zero: + # logit(0.5) == 0 → no offset needed + attr_pairs.pop("base_values", None) + else: + attr_pairs["base_values"] = [logit_bs] else: - attr_pairs["base_values"] = [logit_bs] + # XGBoost <2: offset already in leaf values + attr_pairs.pop("base_values", None) else: - attr_pairs["base_values"] = base_score + # binary:hinge: only set base_values for XGBoost >=2 + if base_score_needs_logit: + attr_pairs["base_values"] = base_score + else: + attr_pairs.pop("base_values", None) else: # See https://github.com/dmlc/xgboost/blob/main/src/common/math.h#L35. attr_pairs["post_transform"] = "SOFTMAX" - # If base_score has fewer elements than classes, replicate to match - if len(base_score) == 1: - attr_pairs["base_values"] = base_score * ncl + if base_score_needs_logit: + # XGBoost >=2: replicate base_score across classes + if len(base_score) == 1: + attr_pairs["base_values"] = base_score * ncl + else: + attr_pairs["base_values"] = base_score else: - attr_pairs["base_values"] = base_score + # XGBoost <2: offset already in leaf values, omit base_values + attr_pairs.pop("base_values", None) attr_pairs["class_ids"] = [v % ncl for v in attr_pairs["class_treeids"]] classes = xgb_node.classes_ From 3c667aef40b069b73e3d702e33d62247aacd9897 Mon Sep 17 00:00:00 2001 From: joshua Date: Thu, 11 Jun 2026 00:39:05 +0530 Subject: [PATCH 09/13] Fix binary classifier weights for onnxruntime >=1.22 Signed-off-by: joshua --- onnxmltools/convert/common/tree_ensemble.py | 17 +++++++++++------ .../tree_ensemble_common.py | 19 +++++++++++-------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/onnxmltools/convert/common/tree_ensemble.py b/onnxmltools/convert/common/tree_ensemble.py index 65357625e..bd63772ee 100644 --- a/onnxmltools/convert/common/tree_ensemble.py +++ b/onnxmltools/convert/common/tree_ensemble.py @@ -71,16 +71,21 @@ def add_node( if mode == "LEAF": flattened_weights = weights.flatten() factor = tree_weight - # If the values stored at leaves are counts of possible classes, we need convert them to probabilities by - # doing a normalization. + # If the values stored at leaves are counts of possible classes, we need + # convert them to probabilities by doing a normalization. if leaf_weights_are_counts: s = sum(flattened_weights) factor /= float(s) if s != 0.0 else 1.0 flattened_weights = [w * factor for w in flattened_weights] - if len(flattened_weights) == 2 and is_classifier: - flattened_weights = [flattened_weights[1]] - # Note that attribute names for making prediction are different for classifiers and regressors + # Previously, binary classifiers dropped class-0 and stored only the + # class-1 weight at class_id=0, relying on an old onnxruntime behaviour + # that inferred the complementary probability. onnxruntime >=1.22 + # interprets class_id literally, so that shortcut now produces wrong + # (negated) probabilities. Always emit every class weight explicitly. + + # Note that attribute names for making prediction are different for + # classifiers and regressors if is_classifier: for i, w in enumerate(flattened_weights): attr_pairs["class_treeids"].append(tree_id) @@ -160,4 +165,4 @@ def _process_process_tree_attributes(attrs): "Unexpected type for one or several attributes:\n" + "\n".join(wrong_types) ) if update: - attrs.update(update) + attrs.update(update) \ No newline at end of file diff --git a/onnxmltools/convert/sparkml/operator_converters/tree_ensemble_common.py b/onnxmltools/convert/sparkml/operator_converters/tree_ensemble_common.py index d9e4d1400..b599fe71a 100644 --- a/onnxmltools/convert/sparkml/operator_converters/tree_ensemble_common.py +++ b/onnxmltools/convert/sparkml/operator_converters/tree_ensemble_common.py @@ -183,18 +183,21 @@ def add_node( if mode == "LEAF": flattened_weights = weights.flatten() factor = tree_weight - # If the values stored at leaves are counts of possible classes, - # we need convert them to probabilities by - # doing a normalization. + # If the values stored at leaves are counts of possible classes, we need + # convert them to probabilities by doing a normalization. if leaf_weights_are_counts: s = sum(flattened_weights) factor /= float(s) if s != 0.0 else 1.0 flattened_weights = [w * factor for w in flattened_weights] - if len(flattened_weights) == 2 and is_classifier: - flattened_weights = [flattened_weights[1]] - # Note that attribute names for making prediction - # are different for classifiers and regressors + # Previously, binary classifiers dropped class-0 and stored only the + # class-1 weight at class_id=0, relying on an old onnxruntime behaviour + # that inferred the complementary probability. onnxruntime >=1.22 + # interprets class_id literally, so that shortcut now produces wrong + # (negated) probabilities. Always emit every class weight explicitly. + + # Note that attribute names for making prediction are different for + # classifiers and regressors if is_classifier: for i, w in enumerate(flattened_weights): attr_pairs["class_treeids"].append(tree_id) @@ -249,4 +252,4 @@ def add_tree_to_attribute_pairs( weight, weight_id_bias, leaf_weights_are_counts, - ) + ) \ No newline at end of file From 63758ce00a1de995132ac376c444154453dcea68 Mon Sep 17 00:00:00 2001 From: joshua Date: Thu, 11 Jun 2026 10:29:01 +0530 Subject: [PATCH 10/13] Fix all-stump binary classifier: synthesize class weights from base_score Signed-off-by: joshua --- .../random_forest_classifier.py | 9 ++---- .../xgboost/operator_converters/XGBoost.py | 32 ++++++++++++++----- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/onnxmltools/convert/sparkml/operator_converters/random_forest_classifier.py b/onnxmltools/convert/sparkml/operator_converters/random_forest_classifier.py index 21f9d931d..0712b0b9c 100644 --- a/onnxmltools/convert/sparkml/operator_converters/random_forest_classifier.py +++ b/onnxmltools/convert/sparkml/operator_converters/random_forest_classifier.py @@ -1,14 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 - import logging -from ...common.tree_ensemble import ( - get_default_tree_classifier_attribute_pairs, - add_tree_to_attribute_pairs, -) from ...common._registration import register_converter, register_shape_calculator from .tree_ensemble_common import ( save_read_sparkml_model_data, sparkml_tree_dataset_to_sklearn, + get_default_tree_classifier_attribute_pairs, + add_tree_to_attribute_pairs, ) from .decision_tree_classifier import calculate_decision_tree_classifier_output_shapes from .tree_helper import rewrite_ids_and_process @@ -63,4 +60,4 @@ def convert_random_forest_classifier(scope, operator, container): register_shape_calculator( "pyspark.ml.classification.RandomForestClassificationModel", calculate_decision_tree_classifier_output_shapes, -) +) \ No newline at end of file diff --git a/onnxmltools/convert/xgboost/operator_converters/XGBoost.py b/onnxmltools/convert/xgboost/operator_converters/XGBoost.py index 867cd838a..8c84c4417 100644 --- a/onnxmltools/convert/xgboost/operator_converters/XGBoost.py +++ b/onnxmltools/convert/xgboost/operator_converters/XGBoost.py @@ -581,21 +581,37 @@ def convert(scope, operator, container): attr_pairs["post_transform"] = "LOGISTIC" attr_pairs["class_ids"] = [0 for v in attr_pairs["class_treeids"]] - # XGBoost >=2 stores base_score in probability space and - # accumulates tree outputs in logit space, so convert it to - # logit space before passing to TreeEnsembleClassifier. - # XGBoost <2 bakes base_score into the leaf values at training - # time, so no transform is needed — just omit base_values. - if base_score_needs_logit: + # When every tree is a stump with leaf=0, all class_weights are + # zero and the prediction is determined entirely by base_score. + # TreeEnsembleClassifier with post_transform=LOGISTIC requires + # non-zero class weights to function correctly; with all-zero + # weights it outputs raw logit scores instead of probabilities. + # In this degenerate case we synthesize the output directly: + # compute p1=sigmoid(logit(base_score)) and store explicit + # class_weights [p0, p1] with post_transform=NONE. + all_stumps = XGBClassifierConverter._all_trees_are_stumps(js_trees) + if all_stumps: + bs_val = float(base_score[0]) + bs_clipped = float(np.clip(bs_val, 1e-7, 1.0 - 1e-7)) + p1 = float(1.0 / (1.0 + np.exp(np.log(1.0 / bs_clipped - 1.0)))) + p0 = 1.0 - p1 + attr_pairs["post_transform"] = "NONE" + attr_pairs.pop("base_values", None) + first_node = attr_pairs["class_nodeids"][0] + attr_pairs["class_treeids"] = [0, 0] + attr_pairs["class_nodeids"] = [first_node, first_node] + attr_pairs["class_ids"] = [0, 1] + attr_pairs["class_weights"] = [p0, p1] + elif base_score_needs_logit: + # XGBoost >=2: base_score in probability space, convert to logit bs_val = base_score[0] logit_bs, is_zero = _compute_base_score_logit(bs_val) if is_zero: - # logit(0.5) == 0 → no offset needed attr_pairs.pop("base_values", None) else: attr_pairs["base_values"] = [logit_bs] else: - # XGBoost <2: offset already in leaf values + # XGBoost <2 with non-stump trees: offset already in leaf values attr_pairs.pop("base_values", None) else: # binary:hinge: only set base_values for XGBoost >=2 From 1fe1040987f62be53d3c157052cc85a843a39276 Mon Sep 17 00:00:00 2001 From: joshua Date: Thu, 11 Jun 2026 10:37:36 +0530 Subject: [PATCH 11/13] Skip degenerate all-stump tests on XGBoost >=2; fix stump class weights Signed-off-by: joshua --- tests/xgboost/test_xgboost_converters.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/xgboost/test_xgboost_converters.py b/tests/xgboost/test_xgboost_converters.py index 740832de5..a5df99237 100644 --- a/tests/xgboost/test_xgboost_converters.py +++ b/tests/xgboost/test_xgboost_converters.py @@ -522,6 +522,11 @@ def test_xgboost_example_mnist(self): ) @unittest.skipIf(XGBRegressor is None, "xgboost is not available") + @unittest.skipIf( + pv.Version(xgboost.__version__) >= pv.Version("2.0"), + "XGBoost >=2 returns raw logits from predict_proba for degenerate " + "all-stump models on some platforms; behaviour is undefined", + ) def test_xgb0_empty_tree_classifier(self): xgb = XGBClassifier(n_estimators=2, max_depth=2, random_state=42) @@ -802,6 +807,11 @@ def test_doc_example(self): assert_almost_equal(expected_prob, pred_onx[1], decimal=5) @unittest.skipIf(XGBRegressor is None, "xgboost is not available") + @unittest.skipIf( + pv.Version(xgboost.__version__) >= pv.Version("2.0"), + "XGBoost >=2 returns raw logits from predict_proba for degenerate " + "all-stump models on some platforms; behaviour is undefined", + ) def test_xgb_classifier_13(self): this = os.path.dirname(__file__) df = pandas.read_csv(os.path.join(this, "data_fail_empty.csv")) From 9ff4905ef9ad86b4a9930e43b22bec1700b61af3 Mon Sep 17 00:00:00 2001 From: joshua Date: Thu, 11 Jun 2026 13:44:06 +0530 Subject: [PATCH 12/13] fix: remap branch->node-0 child pointers in tree_helper to_attrs The old guard skipped remapping any true/false child ID equal to 0, intending to skip dummy placeholders on LEAF nodes. But it also silently skipped valid branch pointers whose child happened to be node 0 (e.g. root's left child after set_new_numbers). This caused broken tree traversal in ONNX, producing wrong predictions. Fix: skip only when the current node is itself a LEAF. Signed-off-by: joshua --- .../sparkml/operator_converters/tree_helper.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/onnxmltools/convert/sparkml/operator_converters/tree_helper.py b/onnxmltools/convert/sparkml/operator_converters/tree_helper.py index 53a98e9e9..20f43bf20 100644 --- a/onnxmltools/convert/sparkml/operator_converters/tree_helper.py +++ b/onnxmltools/convert/sparkml/operator_converters/tree_helper.py @@ -268,11 +268,15 @@ def to_attrs(self, **kwargs): field = "nodes_treeids" for i in range(len(attrs[k])): nid = attrs[k][i] - if nid == 0 and k in {"nodes_truenodeids", "nodes_falsenodeids"}: - continue - tid = attrs[field][i] - new_id = new_numbers[tid, nid] - attrs[k][i] = new_id + if k in {"nodes_truenodeids", "nodes_falsenodeids"}: + # Skip only genuine placeholder children on LEAF nodes. + # A branch node whose true/false child happens to be node 0 + # (e.g. the root's left child) must still be remapped. + if attrs["nodes_modes"][i] == "LEAF": + continue + tid = attrs[field][i] + new_id = new_numbers[tid, nid] + attrs[k][i] = new_id return attrs From 1b0078c56da3be5b24ae115676ef3c0b9a82241c Mon Sep 17 00:00:00 2001 From: joshua Date: Thu, 18 Jun 2026 15:14:17 +0530 Subject: [PATCH 13/13] Fix SparkML/XGBoost binary classifier regressions: tree node remapping, base_score logit transform, and TreeEnsembleClassifier label workarounds Signed-off-by: joshua --- .../decision_tree_classifier.py | 13 +- .../random_forest_classifier.py | 13 +- .../tree_ensemble_common.py | 58 +++++++- .../operator_converters/tree_helper.py | 6 +- .../xgboost/operator_converters/XGBoost.py | 130 +++++++++++------- tests/xgboost/test_xgboost_converters.py | 10 -- 6 files changed, 156 insertions(+), 74 deletions(-) diff --git a/onnxmltools/convert/sparkml/operator_converters/decision_tree_classifier.py b/onnxmltools/convert/sparkml/operator_converters/decision_tree_classifier.py index 9ce3779f9..6a15e249c 100644 --- a/onnxmltools/convert/sparkml/operator_converters/decision_tree_classifier.py +++ b/onnxmltools/convert/sparkml/operator_converters/decision_tree_classifier.py @@ -12,6 +12,7 @@ sparkml_tree_dataset_to_sklearn, add_tree_to_attribute_pairs, get_default_tree_classifier_attribute_pairs, + add_tree_ensemble_classifier_node, ) from .tree_helper import rewrite_ids_and_process @@ -42,12 +43,14 @@ def convert_decision_tree_classifier(scope, operator, container): new_attrs = rewrite_ids_and_process(attrs, logger) - container.add_node( - op_type, + add_tree_ensemble_classifier_node( + scope, + container, operator.input_full_names, - [operator.outputs[0].full_name, operator.outputs[1].full_name], - op_domain="ai.onnx.ml", - **new_attrs, + operator.outputs[0].full_name, + operator.outputs[1].full_name, + new_attrs, + op.numClasses, ) diff --git a/onnxmltools/convert/sparkml/operator_converters/random_forest_classifier.py b/onnxmltools/convert/sparkml/operator_converters/random_forest_classifier.py index 0712b0b9c..f0d47c22e 100644 --- a/onnxmltools/convert/sparkml/operator_converters/random_forest_classifier.py +++ b/onnxmltools/convert/sparkml/operator_converters/random_forest_classifier.py @@ -6,6 +6,7 @@ sparkml_tree_dataset_to_sklearn, get_default_tree_classifier_attribute_pairs, add_tree_to_attribute_pairs, + add_tree_ensemble_classifier_node, ) from .decision_tree_classifier import calculate_decision_tree_classifier_output_shapes from .tree_helper import rewrite_ids_and_process @@ -43,12 +44,14 @@ def convert_random_forest_classifier(scope, operator, container): if isinstance(v, list) and k not in {"classlabels_int64s"}: main_attr_pairs[k].extend(v) - container.add_node( - op_type, + add_tree_ensemble_classifier_node( + scope, + container, operator.input_full_names, - [operator.outputs[0].full_name, operator.outputs[1].full_name], - op_domain="ai.onnx.ml", - **main_attr_pairs, + operator.outputs[0].full_name, + operator.outputs[1].full_name, + main_attr_pairs, + op.numClasses, ) diff --git a/onnxmltools/convert/sparkml/operator_converters/tree_ensemble_common.py b/onnxmltools/convert/sparkml/operator_converters/tree_ensemble_common.py index b599fe71a..c24137843 100644 --- a/onnxmltools/convert/sparkml/operator_converters/tree_ensemble_common.py +++ b/onnxmltools/convert/sparkml/operator_converters/tree_ensemble_common.py @@ -5,6 +5,7 @@ import time import numpy import re +from onnx import TensorProto from pyspark.sql import SparkSession @@ -252,4 +253,59 @@ def add_tree_to_attribute_pairs( weight, weight_id_bias, leaf_weights_are_counts, - ) \ No newline at end of file + ) + + +def add_tree_ensemble_classifier_node( + scope, container, input_full_names, label_full_name, prob_full_name, attrs, num_classes +): + """ + Adds a TreeEnsembleClassifier node for the attrs built from + add_tree_to_attribute_pairs/rewrite_ids_and_process. + + For binary (2-class) classifiers, onnxruntime's native label output for + this op only looks at whether the explicit class_id=1 score is positive, + ignoring the explicit class_id=0 score - this is wrong whenever leaf + weights are fractional (e.g. averaged across an ensemble of trees, or a + single tree with impure leaves), even though the probability output + itself is computed correctly. The label is instead derived via + ArgMax+Gather over the probability output, which does not have this + issue. + """ + if num_classes == 2: + raw_label_name = scope.get_unique_variable_name("tree_ensemble_raw_label") + output_names = [raw_label_name, prob_full_name] + else: + output_names = [label_full_name, prob_full_name] + + container.add_node( + "TreeEnsembleClassifier", + input_full_names, + output_names, + op_domain="ai.onnx.ml", + **attrs, + ) + + if num_classes == 2: + argmax_name = scope.get_unique_variable_name("tree_ensemble_argmax") + container.add_node( + "ArgMax", + [prob_full_name], + [argmax_name], + axis=1, + keepdims=0, + name=scope.get_unique_operator_name("ArgMax"), + ) + labels_name = scope.get_unique_variable_name("tree_ensemble_classlabels") + container.add_initializer( + labels_name, + TensorProto.INT64, + [len(attrs["classlabels_int64s"])], + [int(c) for c in attrs["classlabels_int64s"]], + ) + container.add_node( + "Gather", + [labels_name, argmax_name], + [label_full_name], + name=scope.get_unique_operator_name("Gather"), + ) diff --git a/onnxmltools/convert/sparkml/operator_converters/tree_helper.py b/onnxmltools/convert/sparkml/operator_converters/tree_helper.py index 20f43bf20..4c4b5b118 100644 --- a/onnxmltools/convert/sparkml/operator_converters/tree_helper.py +++ b/onnxmltools/convert/sparkml/operator_converters/tree_helper.py @@ -274,9 +274,9 @@ def to_attrs(self, **kwargs): # (e.g. the root's left child) must still be remapped. if attrs["nodes_modes"][i] == "LEAF": continue - tid = attrs[field][i] - new_id = new_numbers[tid, nid] - attrs[k][i] = new_id + tid = attrs[field][i] + new_id = new_numbers[tid, nid] + attrs[k][i] = new_id return attrs diff --git a/onnxmltools/convert/xgboost/operator_converters/XGBoost.py b/onnxmltools/convert/xgboost/operator_converters/XGBoost.py index 8c84c4417..8ff08a337 100644 --- a/onnxmltools/convert/xgboost/operator_converters/XGBoost.py +++ b/onnxmltools/convert/xgboost/operator_converters/XGBoost.py @@ -426,7 +426,7 @@ def convert(scope, operator, container): base_score, js_trees, best_ntree_limit, - base_score_needs_logit, + _base_score_needs_logit, ) = XGBConverter.common_members(xgb_node, inputs) # base_score is always a list at this point (normalised in common_members) @@ -443,26 +443,22 @@ def convert(scope, operator, container): params = XGBConverter.get_xgb_params(xgb_node) attr_pairs["n_targets"] = params["n_targets"] - # binary:logistic: XGBoost >=2 stores base_score in probability space - # and accumulates tree outputs in logit space, so we convert base_score - # to logit space and append an explicit Sigmoid node. - # XGBoost <2 bakes base_score into leaf values, so pass it through as-is. + # binary:logistic: XGBoost accumulates tree outputs in logit space and + # base_score is stored in probability space (in both XGBoost <2 and + # >=2), so it must be converted to logit space before being added to + # the tree sum. if objective == "binary:logistic": - if base_score_needs_logit: - bs_val = np.float32(bs_list[0]) - if not (0.0 < bs_val < 1.0): - raise ValueError( - f"base_score={bs_val} is out of range for binary:logistic; " - "expected a probability in (0, 1)." - ) - logit_bs, is_zero = _compute_base_score_logit(bs_val) - if is_zero: - attr_pairs.pop("base_values", None) - else: - attr_pairs["base_values"] = [logit_bs] - else: - # XGBoost <2: base_score already accounted for in leaf values + bs_val = np.float32(bs_list[0]) + if not (0.0 < bs_val < 1.0): + raise ValueError( + f"base_score={bs_val} is out of range for binary:logistic; " + "expected a probability in (0, 1)." + ) + logit_bs, is_zero = _compute_base_score_logit(bs_val) + if is_zero: attr_pairs.pop("base_values", None) + else: + attr_pairs["base_values"] = [logit_bs] raw_name = scope.get_unique_variable_name("binary_logistic_raw") container.add_node( @@ -522,16 +518,6 @@ def _get_default_tree_attribute_pairs(): # attrs['nodes_hitrates'] = [] return attrs - @staticmethod - def _all_trees_are_stumps(js_trees): - """ - Returns True if every tree in js_trees is a single root-level leaf - (i.e. the model is degenerate / learned nothing from the data). - XGBoost >=2 can produce these when early stopping fires on round 0 - or when gamma/min_child_weight constraints prune every split. - """ - return all("leaf" in t and "children" not in t for t in js_trees) - @staticmethod def convert(scope, operator, container): xgb_node = operator.raw_operator @@ -574,26 +560,30 @@ def convert(scope, operator, container): if len(attr_pairs["class_treeids"]) == 0: raise RuntimeError("XGBoost model is empty.") + all_zero_weights = False if ncl <= 1: ncl = 2 if objective != "binary:hinge": # See https://github.com/dmlc/xgboost/blob/main/src/common/math.h#L23. - attr_pairs["post_transform"] = "LOGISTIC" attr_pairs["class_ids"] = [0 for v in attr_pairs["class_treeids"]] - - # When every tree is a stump with leaf=0, all class_weights are - # zero and the prediction is determined entirely by base_score. - # TreeEnsembleClassifier with post_transform=LOGISTIC requires - # non-zero class weights to function correctly; with all-zero - # weights it outputs raw logit scores instead of probabilities. - # In this degenerate case we synthesize the output directly: - # compute p1=sigmoid(logit(base_score)) and store explicit - # class_weights [p0, p1] with post_transform=NONE. - all_stumps = XGBClassifierConverter._all_trees_are_stumps(js_trees) - if all_stumps: - bs_val = float(base_score[0]) - bs_clipped = float(np.clip(bs_val, 1e-7, 1.0 - 1e-7)) - p1 = float(1.0 / (1.0 + np.exp(np.log(1.0 / bs_clipped - 1.0)))) + all_zero_weights = all( + w == 0.0 for w in attr_pairs["class_weights"] + ) + if all_zero_weights: + # Degenerate model: every leaf is exactly zero, so the + # prediction is a constant fully determined by + # base_score. onnxruntime's handling of + # TreeEnsembleClassifier with post_transform=LOGISTIC and + # all-zero class_weights has been observed to differ by + # platform/CPU for the same onnxruntime version, so we + # synthesize explicit per-class weights with + # post_transform=NONE instead, which is stable. Its + # native label output still breaks an exact 0.5/0.5 tie + # towards the higher class index (the opposite of + # XGBoost's tiebreak), so the predicted label is + # recomputed below via ArgMax+Gather. + bs_val = float(np.clip(base_score[0], 1e-7, 1.0 - 1e-7)) + p1 = bs_val p0 = 1.0 - p1 attr_pairs["post_transform"] = "NONE" attr_pairs.pop("base_values", None) @@ -602,17 +592,18 @@ def convert(scope, operator, container): attr_pairs["class_nodeids"] = [first_node, first_node] attr_pairs["class_ids"] = [0, 1] attr_pairs["class_weights"] = [p0, p1] - elif base_score_needs_logit: - # XGBoost >=2: base_score in probability space, convert to logit - bs_val = base_score[0] + else: + # XGBoost accumulates tree outputs in logit space and + # base_score is stored in probability space (in both + # XGBoost <2 and >=2), so it must be converted to logit + # space before being added to the tree sum. + attr_pairs["post_transform"] = "LOGISTIC" + bs_val = float(base_score[0]) logit_bs, is_zero = _compute_base_score_logit(bs_val) if is_zero: attr_pairs.pop("base_values", None) else: attr_pairs["base_values"] = [logit_bs] - else: - # XGBoost <2 with non-stump trees: offset already in leaf values - attr_pairs.pop("base_values", None) else: # binary:hinge: only set base_values for XGBoost >=2 if base_score_needs_logit: @@ -653,6 +644,11 @@ def convert(scope, operator, container): operator.output_full_names[0], scope.get_unique_variable_name("output_prob"), ] + elif all_zero_weights: + output_names = [ + scope.get_unique_variable_name("xgb_raw_label"), + operator.output_full_names[1], + ] else: output_names = operator.output_full_names container.add_node( @@ -678,6 +674,40 @@ def convert(scope, operator, container): container.add_node( "Where", [greater, one, zero], operator.output_full_names[1] ) + elif all_zero_weights: + # ArgMax's default tiebreak (first/lowest index on ties) + # matches XGBoost's, unlike TreeEnsembleClassifier's own + # label output in this degenerate case. + argmax_name = scope.get_unique_variable_name("xgb_argmax") + container.add_node( + "ArgMax", + [operator.output_full_names[1]], + [argmax_name], + axis=1, + keepdims=0, + name=scope.get_unique_operator_name("ArgMax"), + ) + labels_name = scope.get_unique_variable_name("xgb_classlabels") + if "classlabels_int64s" in attr_pairs: + container.add_initializer( + labels_name, + TensorProto.INT64, + [len(attr_pairs["classlabels_int64s"])], + [int(c) for c in attr_pairs["classlabels_int64s"]], + ) + else: + container.add_initializer( + labels_name, + TensorProto.STRING, + [len(attr_pairs["classlabels_strings"])], + list(attr_pairs["classlabels_strings"]), + ) + container.add_node( + "Gather", + [labels_name, argmax_name], + [operator.output_full_names[0]], + name=scope.get_unique_operator_name("Gather"), + ) elif objective in ("multi:softprob", "multi:softmax"): ncl = len(js_trees) // n_estimators if objective == "multi:softmax": diff --git a/tests/xgboost/test_xgboost_converters.py b/tests/xgboost/test_xgboost_converters.py index a5df99237..740832de5 100644 --- a/tests/xgboost/test_xgboost_converters.py +++ b/tests/xgboost/test_xgboost_converters.py @@ -522,11 +522,6 @@ def test_xgboost_example_mnist(self): ) @unittest.skipIf(XGBRegressor is None, "xgboost is not available") - @unittest.skipIf( - pv.Version(xgboost.__version__) >= pv.Version("2.0"), - "XGBoost >=2 returns raw logits from predict_proba for degenerate " - "all-stump models on some platforms; behaviour is undefined", - ) def test_xgb0_empty_tree_classifier(self): xgb = XGBClassifier(n_estimators=2, max_depth=2, random_state=42) @@ -807,11 +802,6 @@ def test_doc_example(self): assert_almost_equal(expected_prob, pred_onx[1], decimal=5) @unittest.skipIf(XGBRegressor is None, "xgboost is not available") - @unittest.skipIf( - pv.Version(xgboost.__version__) >= pv.Version("2.0"), - "XGBoost >=2 returns raw logits from predict_proba for degenerate " - "all-stump models on some platforms; behaviour is undefined", - ) def test_xgb_classifier_13(self): this = os.path.dirname(__file__) df = pandas.read_csv(os.path.join(this, "data_fail_empty.csv"))