diff --git a/onnxmltools/convert/common/tree_ensemble.py b/onnxmltools/convert/common/tree_ensemble.py index 65357625e..bd63772ee 100644 --- a/onnxmltools/convert/common/tree_ensemble.py +++ b/onnxmltools/convert/common/tree_ensemble.py @@ -71,16 +71,21 @@ def add_node( if mode == "LEAF": flattened_weights = weights.flatten() factor = tree_weight - # If the values stored at leaves are counts of possible classes, we need convert them to probabilities by - # doing a normalization. + # If the values stored at leaves are counts of possible classes, we need + # to convert them to probabilities by doing a normalization. if leaf_weights_are_counts: s = sum(flattened_weights) factor /= float(s) if s != 0.0 else 1.0 flattened_weights = [w * factor for w in flattened_weights] - if len(flattened_weights) == 2 and is_classifier: - flattened_weights = [flattened_weights[1]] - # Note that attribute names for making prediction are different for classifiers and regressors + # Previously, binary classifiers dropped class-0 and stored only the + # class-1 weight at class_id=0, relying on an old onnxruntime behaviour + # that inferred the complementary probability. onnxruntime >=1.22 + # interprets class_id literally, so that shortcut now produces wrong + # (negated) probabilities. Always emit every class weight explicitly.
+ + # Note that attribute names for making prediction are different for + # classifiers and regressors if is_classifier: for i, w in enumerate(flattened_weights): attr_pairs["class_treeids"].append(tree_id) @@ -160,4 +165,4 @@ def _process_process_tree_attributes(attrs): "Unexpected type for one or several attributes:\n" + "\n".join(wrong_types) ) if update: - attrs.update(update) + attrs.update(update) \ No newline at end of file diff --git a/onnxmltools/convert/lightgbm/operator_converters/LightGbm.py b/onnxmltools/convert/lightgbm/operator_converters/LightGbm.py index d03a50739..07dfeb87d 100644 --- a/onnxmltools/convert/lightgbm/operator_converters/LightGbm.py +++ b/onnxmltools/convert/lightgbm/operator_converters/LightGbm.py @@ -554,9 +554,9 @@ def convert_lightgbm(scope, operator, container): if "num_class" in gbm_text: n_classes = gbm_text["num_class"] if n_classes == 1: - attrs["post_transform"] = "LOGISTIC" + attrs["post_transform"] = "LOGISTIC" # binary → needs sigmoid else: - attrs["post_transform"] = "NONE" + attrs["post_transform"] = "NONE" # multiclass → already softmax elsewhere objective = "binary" else: raise NotImplementedError( diff --git a/onnxmltools/convert/sparkml/operator_converters/decision_tree_classifier.py b/onnxmltools/convert/sparkml/operator_converters/decision_tree_classifier.py index 9ce3779f9..6a15e249c 100644 --- a/onnxmltools/convert/sparkml/operator_converters/decision_tree_classifier.py +++ b/onnxmltools/convert/sparkml/operator_converters/decision_tree_classifier.py @@ -12,6 +12,7 @@ sparkml_tree_dataset_to_sklearn, add_tree_to_attribute_pairs, get_default_tree_classifier_attribute_pairs, + add_tree_ensemble_classifier_node, ) from .tree_helper import rewrite_ids_and_process @@ -42,12 +43,14 @@ def convert_decision_tree_classifier(scope, operator, container): new_attrs = rewrite_ids_and_process(attrs, logger) - container.add_node( - op_type, + add_tree_ensemble_classifier_node( + scope, + container, operator.input_full_names, 
- [operator.outputs[0].full_name, operator.outputs[1].full_name], - op_domain="ai.onnx.ml", - **new_attrs, + operator.outputs[0].full_name, + operator.outputs[1].full_name, + new_attrs, + op.numClasses, ) diff --git a/onnxmltools/convert/sparkml/operator_converters/random_forest_classifier.py b/onnxmltools/convert/sparkml/operator_converters/random_forest_classifier.py index 21f9d931d..f0d47c22e 100644 --- a/onnxmltools/convert/sparkml/operator_converters/random_forest_classifier.py +++ b/onnxmltools/convert/sparkml/operator_converters/random_forest_classifier.py @@ -1,14 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 - import logging -from ...common.tree_ensemble import ( - get_default_tree_classifier_attribute_pairs, - add_tree_to_attribute_pairs, -) from ...common._registration import register_converter, register_shape_calculator from .tree_ensemble_common import ( save_read_sparkml_model_data, sparkml_tree_dataset_to_sklearn, + get_default_tree_classifier_attribute_pairs, + add_tree_to_attribute_pairs, + add_tree_ensemble_classifier_node, ) from .decision_tree_classifier import calculate_decision_tree_classifier_output_shapes from .tree_helper import rewrite_ids_and_process @@ -46,12 +44,14 @@ def convert_random_forest_classifier(scope, operator, container): if isinstance(v, list) and k not in {"classlabels_int64s"}: main_attr_pairs[k].extend(v) - container.add_node( - op_type, + add_tree_ensemble_classifier_node( + scope, + container, operator.input_full_names, - [operator.outputs[0].full_name, operator.outputs[1].full_name], - op_domain="ai.onnx.ml", - **main_attr_pairs, + operator.outputs[0].full_name, + operator.outputs[1].full_name, + main_attr_pairs, + op.numClasses, ) @@ -63,4 +63,4 @@ def convert_random_forest_classifier(scope, operator, container): register_shape_calculator( "pyspark.ml.classification.RandomForestClassificationModel", calculate_decision_tree_classifier_output_shapes, -) +) \ No newline at end of file diff --git 
a/onnxmltools/convert/sparkml/operator_converters/tree_ensemble_common.py b/onnxmltools/convert/sparkml/operator_converters/tree_ensemble_common.py index d9e4d1400..c24137843 100644 --- a/onnxmltools/convert/sparkml/operator_converters/tree_ensemble_common.py +++ b/onnxmltools/convert/sparkml/operator_converters/tree_ensemble_common.py @@ -5,6 +5,7 @@ import time import numpy import re +from onnx import TensorProto from pyspark.sql import SparkSession @@ -183,18 +184,21 @@ def add_node( if mode == "LEAF": flattened_weights = weights.flatten() factor = tree_weight - # If the values stored at leaves are counts of possible classes, - # we need convert them to probabilities by - # doing a normalization. + # If the values stored at leaves are counts of possible classes, we need + # to convert them to probabilities by doing a normalization. if leaf_weights_are_counts: s = sum(flattened_weights) factor /= float(s) if s != 0.0 else 1.0 flattened_weights = [w * factor for w in flattened_weights] - if len(flattened_weights) == 2 and is_classifier: - flattened_weights = [flattened_weights[1]] - # Note that attribute names for making prediction - # are different for classifiers and regressors + # Previously, binary classifiers dropped class-0 and stored only the + # class-1 weight at class_id=0, relying on an old onnxruntime behaviour + # that inferred the complementary probability. onnxruntime >=1.22 + # interprets class_id literally, so that shortcut now produces wrong + # (negated) probabilities. Always emit every class weight explicitly.
+ + # Note that attribute names for making prediction are different for + # classifiers and regressors if is_classifier: for i, w in enumerate(flattened_weights): attr_pairs["class_treeids"].append(tree_id) @@ -250,3 +254,58 @@ def add_tree_to_attribute_pairs( weight_id_bias, leaf_weights_are_counts, ) + + +def add_tree_ensemble_classifier_node( + scope, container, input_full_names, label_full_name, prob_full_name, attrs, num_classes +): + """ + Adds a TreeEnsembleClassifier node for the attrs built from + add_tree_to_attribute_pairs/rewrite_ids_and_process. + + For binary (2-class) classifiers, onnxruntime's native label output for + this op only looks at whether the explicit class_id=1 score is positive, + ignoring the explicit class_id=0 score - this is wrong whenever leaf + weights are fractional (e.g. averaged across an ensemble of trees, or a + single tree with impure leaves), even though the probability output + itself is computed correctly. The label is instead derived via + ArgMax+Gather over the probability output, which does not have this + issue. 
+ """ + if num_classes == 2: + raw_label_name = scope.get_unique_variable_name("tree_ensemble_raw_label") + output_names = [raw_label_name, prob_full_name] + else: + output_names = [label_full_name, prob_full_name] + + container.add_node( + "TreeEnsembleClassifier", + input_full_names, + output_names, + op_domain="ai.onnx.ml", + **attrs, + ) + + if num_classes == 2: + argmax_name = scope.get_unique_variable_name("tree_ensemble_argmax") + container.add_node( + "ArgMax", + [prob_full_name], + [argmax_name], + axis=1, + keepdims=0, + name=scope.get_unique_operator_name("ArgMax"), + ) + labels_name = scope.get_unique_variable_name("tree_ensemble_classlabels") + container.add_initializer( + labels_name, + TensorProto.INT64, + [len(attrs["classlabels_int64s"])], + [int(c) for c in attrs["classlabels_int64s"]], + ) + container.add_node( + "Gather", + [labels_name, argmax_name], + [label_full_name], + name=scope.get_unique_operator_name("Gather"), + ) diff --git a/onnxmltools/convert/sparkml/operator_converters/tree_helper.py b/onnxmltools/convert/sparkml/operator_converters/tree_helper.py index 53a98e9e9..4c4b5b118 100644 --- a/onnxmltools/convert/sparkml/operator_converters/tree_helper.py +++ b/onnxmltools/convert/sparkml/operator_converters/tree_helper.py @@ -268,8 +268,12 @@ def to_attrs(self, **kwargs): field = "nodes_treeids" for i in range(len(attrs[k])): nid = attrs[k][i] - if nid == 0 and k in {"nodes_truenodeids", "nodes_falsenodeids"}: - continue + if k in {"nodes_truenodeids", "nodes_falsenodeids"}: + # Skip only genuine placeholder children on LEAF nodes. + # A branch node whose true/false child happens to be node 0 + # (e.g. the root's left child) must still be remapped. 
if attrs["nodes_modes"][i] == "LEAF": + continue tid = attrs[field][i] new_id = new_numbers[tid, nid] attrs[k][i] = new_id diff --git a/onnxmltools/convert/xgboost/operator_converters/XGBoost.py b/onnxmltools/convert/xgboost/operator_converters/XGBoost.py index cfe76b255..8ff08a337 100644 --- a/onnxmltools/convert/xgboost/operator_converters/XGBoost.py +++ b/onnxmltools/convert/xgboost/operator_converters/XGBoost.py @@ -56,15 +56,36 @@ def common_members(xgb_node, inputs): best_ntree_limit = xgb_node.best_iteration + 1 else: best_ntree_limit = params.get("best_ntree_limit", None) - if base_score is None: - base_score = [0.5] + + # Detect whether base_score came from the model config (XGBoost >=2) + # or from raw sklearn params (XGBoost <2). + # + # XGBoost >=2: get_xgb_params() sets base_score to a list (e.g. [0.5]) + # read from save_config(). The value is in *probability* space, and + # XGBoost accumulates tree outputs in *logit* space, so we must convert + # base_score to logit space before passing it to the ONNX operator. + # + # XGBoost <2: base_score is a plain float coming directly from + # get_xgb_params() / __dict__. XGBoost <2 bakes the base_score offset + # into the tree leaf values at training time. NOTE: despite its name, + # the flag only gates base_values for hinge/multi-class classifiers. + if isinstance(base_score, list): + base_score_needs_logit = True + else: + # Normalise to list for uniform downstream handling + base_score_needs_logit = False + if base_score is None: + base_score = [0.5] + else: + base_score = [float(base_score)] + booster = xgb_node.get_booster() # The json format was available in October 2017. # XGBoost 0.7 was the first version released with it.
js_tree_list = booster.get_dump(with_stats=True, dump_format="json") js_trees: TreeLike = [json.loads(s) for s in js_tree_list] js_trees = XGBConverter._process_categorical_features(js_trees) - return objective, base_score, js_trees, best_ntree_limit + return objective, base_score, js_trees, best_ntree_limit, base_score_needs_logit @staticmethod def _is_bracketed_json_list_string(s: str) -> bool: @@ -364,6 +385,18 @@ def fill_tree_attributes(js_xgb_node, attr_pairs, tree_weights, is_classifier): ) +def _compute_base_score_logit(base_score): + """ + Convert a base_score probability value to logit space. + Returns (logit_value, is_zero) where is_zero=True means logit is 0 + (i.e. base_score == 0.5) and no base_values entry is needed. + """ + bs_val = np.float32(base_score) + bs_clipped = np.clip(bs_val, 1e-7, 1.0 - 1e-7) + logit_bs = float(-np.log(1.0 / bs_clipped - 1.0)) + return logit_bs, np.isclose(logit_bs, 0.0) + + class XGBRegressorConverter(XGBConverter): """ Converter for XGBoost Regressor models to ONNX format. 
@@ -388,19 +421,21 @@ def _get_default_tree_attribute_pairs(): def convert(scope, operator, container): xgb_node = operator.raw_operator inputs = operator.inputs - objective, base_score, js_trees, best_ntree_limit = XGBConverter.common_members( - xgb_node, inputs - ) + ( + objective, + base_score, + js_trees, + best_ntree_limit, + _base_score_needs_logit, + ) = XGBConverter.common_members(xgb_node, inputs) - attr_pairs = XGBRegressorConverter._get_default_tree_attribute_pairs() - if isinstance(base_score, list): - attr_pairs["base_values"] = base_score - else: - attr_pairs["base_values"] = [base_score] + # base_score is always a list at this point (normalised in common_members) + bs_list = base_score if best_ntree_limit and best_ntree_limit < len(js_trees): js_trees = js_trees[:best_ntree_limit] + attr_pairs = XGBRegressorConverter._get_default_tree_attribute_pairs() XGBConverter.fill_tree_attributes( js_trees, attr_pairs, [1 for _ in js_trees], False ) @@ -408,13 +443,49 @@ def convert(scope, operator, container): params = XGBConverter.get_xgb_params(xgb_node) attr_pairs["n_targets"] = params["n_targets"] + # binary:logistic: XGBoost accumulates tree outputs in logit space and + # base_score is stored in probability space (in both XGBoost <2 and + # >=2), so it must be converted to logit space before being added to + # the tree sum. + if objective == "binary:logistic": + bs_val = np.float32(bs_list[0]) + if not (0.0 < bs_val < 1.0): + raise ValueError( + f"base_score={bs_val} is out of range for binary:logistic; " + "expected a probability in (0, 1)." 
+ ) + logit_bs, is_zero = _compute_base_score_logit(bs_val) + if is_zero: + attr_pairs.pop("base_values", None) + else: + attr_pairs["base_values"] = [logit_bs] + + raw_name = scope.get_unique_variable_name("binary_logistic_raw") + container.add_node( + "TreeEnsembleRegressor", + operator.input_full_names, + [raw_name], + op_domain="ai.onnx.ml", + name=scope.get_unique_operator_name("TreeEnsembleRegressor"), + **attr_pairs, + ) + container.add_node( + "Sigmoid", + [raw_name], + operator.output_full_names, + name=scope.get_unique_operator_name("Sigmoid"), + ) + return + # add nodes objectives_with_loglink = {"count:poisson", "reg:gamma", "reg:tweedie"} if objective in objectives_with_loglink: names = [scope.get_unique_variable_name("tree")] - del attr_pairs["base_values"] + attr_pairs.pop("base_values", None) else: + attr_pairs["base_values"] = bs_list names = operator.output_full_names + container.add_node( "TreeEnsembleRegressor", operator.input_full_names, @@ -427,7 +498,7 @@ def convert(scope, operator, container): if objective in objectives_with_loglink: cst = scope.get_unique_variable_name("raw_prediction") container.add_initializer( - cst, TensorProto.FLOAT, [len(base_score)], base_score + cst, TensorProto.FLOAT, [len(bs_list)], bs_list ) new_name = scope.get_unique_variable_name("exp") container.add_node("Exp", names, [new_name]) @@ -452,9 +523,15 @@ def convert(scope, operator, container): xgb_node = operator.raw_operator inputs = operator.inputs - objective, base_score, js_trees, best_ntree_limit = XGBConverter.common_members( - xgb_node, inputs - ) + ( + objective, + base_score, + js_trees, + best_ntree_limit, + base_score_needs_logit, + ) = XGBConverter.common_members(xgb_node, inputs) + + # base_score is always a list at this point (normalised in common_members) params = XGBConverter.get_xgb_params(xgb_node) n_estimators = get_n_estimators_classifier(xgb_node, params, js_trees) @@ -483,31 +560,68 @@ def convert(scope, operator, container): if 
len(attr_pairs["class_treeids"]) == 0: raise RuntimeError("XGBoost model is empty.") + all_zero_weights = False if ncl <= 1: ncl = 2 if objective != "binary:hinge": # See https://github.com/dmlc/xgboost/blob/main/src/common/math.h#L23. - attr_pairs["post_transform"] = "LOGISTIC" attr_pairs["class_ids"] = [0 for v in attr_pairs["class_treeids"]] - if js_trees[0].get("leaf", None) == 0: - attr_pairs["base_values"] = base_score + all_zero_weights = all( + w == 0.0 for w in attr_pairs["class_weights"] + ) + if all_zero_weights: + # Degenerate model: every leaf is exactly zero, so the + # prediction is a constant fully determined by + # base_score. onnxruntime's handling of + # TreeEnsembleClassifier with post_transform=LOGISTIC and + # all-zero class_weights has been observed to differ by + # platform/CPU for the same onnxruntime version, so we + # synthesize explicit per-class weights with + # post_transform=NONE instead, which is stable. Its + # native label output still breaks an exact 0.5/0.5 tie + # towards the higher class index (the opposite of + # XGBoost's tiebreak), so the predicted label is + # recomputed below via ArgMax+Gather. + bs_val = float(np.clip(base_score[0], 1e-7, 1.0 - 1e-7)) + p1 = bs_val + p0 = 1.0 - p1 + attr_pairs["post_transform"] = "NONE" + attr_pairs.pop("base_values", None) + first_node = attr_pairs["class_nodeids"][0] + attr_pairs["class_treeids"] = [0, 0] + attr_pairs["class_nodeids"] = [first_node, first_node] + attr_pairs["class_ids"] = [0, 1] + attr_pairs["class_weights"] = [p0, p1] else: - # Transform base_score - for binary, use first element - bs_val = base_score[0] - if bs_val != 0.5: - # 0.5 -> cst = 0 - cst = -np.log(1 / np.float32(bs_val) - 1.0) - attr_pairs["base_values"] = [cst] + # XGBoost accumulates tree outputs in logit space and + # base_score is stored in probability space (in both + # XGBoost <2 and >=2), so it must be converted to logit + # space before being added to the tree sum. 
+ attr_pairs["post_transform"] = "LOGISTIC" + bs_val = float(base_score[0]) + logit_bs, is_zero = _compute_base_score_logit(bs_val) + if is_zero: + attr_pairs.pop("base_values", None) + else: + attr_pairs["base_values"] = [logit_bs] else: - attr_pairs["base_values"] = base_score + # binary:hinge: only set base_values for XGBoost >=2 + if base_score_needs_logit: + attr_pairs["base_values"] = base_score + else: + attr_pairs.pop("base_values", None) else: # See https://github.com/dmlc/xgboost/blob/main/src/common/math.h#L35. attr_pairs["post_transform"] = "SOFTMAX" - # If base_score has fewer elements than classes, replicate to match - if len(base_score) == 1: - attr_pairs["base_values"] = base_score * ncl + if base_score_needs_logit: + # XGBoost >=2: replicate base_score across classes + if len(base_score) == 1: + attr_pairs["base_values"] = base_score * ncl + else: + attr_pairs["base_values"] = base_score else: - attr_pairs["base_values"] = base_score + # XGBoost <2: offset already in leaf values, omit base_values + attr_pairs.pop("base_values", None) attr_pairs["class_ids"] = [v % ncl for v in attr_pairs["class_treeids"]] classes = xgb_node.classes_ @@ -530,6 +644,11 @@ def convert(scope, operator, container): operator.output_full_names[0], scope.get_unique_variable_name("output_prob"), ] + elif all_zero_weights: + output_names = [ + scope.get_unique_variable_name("xgb_raw_label"), + operator.output_full_names[1], + ] else: output_names = operator.output_full_names container.add_node( @@ -555,6 +674,40 @@ def convert(scope, operator, container): container.add_node( "Where", [greater, one, zero], operator.output_full_names[1] ) + elif all_zero_weights: + # ArgMax's default tiebreak (first/lowest index on ties) + # matches XGBoost's, unlike TreeEnsembleClassifier's own + # label output in this degenerate case. 
+ argmax_name = scope.get_unique_variable_name("xgb_argmax") + container.add_node( + "ArgMax", + [operator.output_full_names[1]], + [argmax_name], + axis=1, + keepdims=0, + name=scope.get_unique_operator_name("ArgMax"), + ) + labels_name = scope.get_unique_variable_name("xgb_classlabels") + if "classlabels_int64s" in attr_pairs: + container.add_initializer( + labels_name, + TensorProto.INT64, + [len(attr_pairs["classlabels_int64s"])], + [int(c) for c in attr_pairs["classlabels_int64s"]], + ) + else: + container.add_initializer( + labels_name, + TensorProto.STRING, + [len(attr_pairs["classlabels_strings"])], + list(attr_pairs["classlabels_strings"]), + ) + container.add_node( + "Gather", + [labels_name, argmax_name], + [operator.output_full_names[0]], + name=scope.get_unique_operator_name("Gather"), + ) elif objective in ("multi:softprob", "multi:softmax"): ncl = len(js_trees) // n_estimators if objective == "multi:softmax": @@ -609,4 +762,4 @@ def convert_xgboost(scope, operator, container): register_converter("XGBClassifier", convert_xgboost) register_converter("XGBRFClassifier", convert_xgboost) register_converter("XGBRegressor", convert_xgboost) -register_converter("XGBRFRegressor", convert_xgboost) +register_converter("XGBRFRegressor", convert_xgboost) \ No newline at end of file diff --git a/tests/xgboost/test_xgboost_converters.py b/tests/xgboost/test_xgboost_converters.py index c6d9f4e45..740832de5 100644 --- a/tests/xgboost/test_xgboost_converters.py +++ b/tests/xgboost/test_xgboost_converters.py @@ -868,15 +868,16 @@ def test_xgb_classifier_13_2(self): initial_types = [("float_input", FloatTensorType([None, x_train.shape[1]]))] onnx_model = convert_xgboost(model, initial_types=initial_types) - for att in onnx_model.graph.node[0].attribute: - if att.name == "nodes_treeids": - self.assertLess(max(att.ints), 1000) - if att.name == "class_ids": - self.assertEqual(set(att.ints), {0}) - if att.name == "base_values": - self.assertEqual(len(att.floats), 1) - if 
att.name == "post_transform": - self.assertEqual(att.s, b"LOGISTIC") + tree_node = next( + node + for node in onnx_model.graph.node + if node.op_type == "TreeEnsembleClassifier" + ) + tree_attrs = {att.name: att for att in tree_node.attribute} + self.assertLess(max(tree_attrs["nodes_treeids"].ints), 1000) + if "base_values" in tree_attrs: + self.assertEqual(len(tree_attrs["base_values"].floats), 1) + self.assertEqual(tree_attrs["post_transform"].s, b"LOGISTIC") expected = model.predict(x_test), model.predict_proba(x_test) sess = InferenceSession(onnx_model.SerializeToString()) @@ -934,7 +935,7 @@ def test_xgb_regressor_categorical_hist(self): # Note: X[["f0"]].values gives actual category values (e.g. 65, 66, 67), # but XGBoost stores category codes (0, 1, 2...) in its tree JSON dump, # so ONNX BRANCH_EQ nodes compare against codes, not raw values. - cat_codes = X["f0"].cat.codes.values.reshape(-1, 1).astype(np.float32) + cat_codes = X["f0"].cat.codes.to_numpy(dtype=np.float32).reshape(-1, 1) num_col = X[["f1"]].values.astype(np.float32) X_onnx = np.concatenate([cat_codes, num_col], axis=1) @@ -996,7 +997,7 @@ def test_xgb_regressor_categorical_hist_native(self): ) # Use pandas category codes (0, 1, 2...) not raw values (65, 66, 67...) 
- cat_codes = X["f0"].cat.codes.values.reshape(-1, 1).astype(np.float32) + cat_codes = X["f0"].cat.codes.to_numpy(dtype=np.float32).reshape(-1, 1) num_col = X[["f1"]].values.astype(np.float32) X_onnx = np.concatenate([cat_codes, num_col], axis=1) diff --git a/tests/xgboost/test_xgboost_issues.py b/tests/xgboost/test_xgboost_issues.py index 0c989f266..49851d3c8 100644 --- a/tests/xgboost/test_xgboost_issues.py +++ b/tests/xgboost/test_xgboost_issues.py @@ -1,5 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 - import unittest try: @@ -58,6 +57,71 @@ def xgbregressor_shape_calculator(operator): got = sess.run(None, {"float_input": X.astype(np.float32)}) self.assertEqual(got[0].shape, (100, 2)) + @unittest.skipIf(XGBRegressor is None, "xgboost is not available") + def test_issue_726_binary_logistic_subsample(self): + import numpy as np + import onnxruntime as rt + from skl2onnx import convert_sklearn, update_registered_converter + from skl2onnx.common.data_types import FloatTensorType + from skl2onnx.common.shape_calculator import ( + calculate_linear_regressor_output_shapes, + ) + from onnxmltools.convert.xgboost.operator_converters.XGBoost import ( + convert_xgboost, + ) + + # overwrite_existing was removed in skl2onnx >=1.18; the default + # behaviour is already to overwrite, so simply drop the kwarg. 
+ update_registered_converter( + XGBRegressor, + "XGBoostXGBRegressor", + calculate_linear_regressor_output_shapes, + convert_xgboost, + ) + + X = np.array( + [[1.0], [2.0], [3.0], [4.0], [2.0], [3.0], [1.0], [2.0]], + dtype=np.float32, + ) + y = np.array([1, 0, 1, 0, 1, 1, 0, 1], dtype=np.float32) + + model = XGBRegressor( + max_depth=1, + n_estimators=3, + subsample=0.95, + objective="binary:logistic", + random_state=0, + ) + model.fit(X, y) + + initial_types = [("f1", FloatTensorType([None, 1]))] + + onnx_model = convert_sklearn( + model, + "XGBoostXGBRegressor", + initial_types, + target_opset={"": 13, "ai.onnx.ml": 3}, + ) + + sess = rt.InferenceSession( + onnx_model.SerializeToString(), + providers=["CPUExecutionProvider"], + ) + + got = sess.run(None, {"f1": X})[0] + expected = model.predict(X).reshape(-1, 1).astype(np.float32) + + np.testing.assert_allclose( + got, + expected, + rtol=1e-5, + atol=1e-8, + err_msg=( + f"\nExpected: {expected.flatten()}" + f"\nONNX: {got.flatten()}" + ), + ) + if __name__ == "__main__": - unittest.main() + unittest.main() \ No newline at end of file