Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 36 additions & 7 deletions onnxmltools/convert/xgboost/operator_converters/XGBoost.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,14 @@ def _add_node(

@staticmethod
def _fill_node_attributes(
treeid, tree_weight, jsnode, attr_pairs, is_classifier, remap, ids_covered: set
treeid,
tree_weight,
jsnode,
attr_pairs,
is_classifier,
remap,
ids_covered: set,
target_id=0,
):
node_id = remap[jsnode["nodeid"]]
if node_id in ids_covered:
Expand Down Expand Up @@ -316,13 +323,14 @@ def _fill_node_attributes(
is_classifier,
remap,
ids_covered,
target_id,
)
else:
raise RuntimeError("Unable to convert this node {0}".format(ch))

else:
weights = [jsnode["leaf"]]
weights_id_bias = 0
weights_id_bias = target_id
XGBConverter._add_node(
attr_pairs=attr_pairs,
is_classifier=is_classifier,
Expand Down Expand Up @@ -353,14 +361,24 @@ def _remap_nodeid(jsnode, remap=None):
return remap

@staticmethod
def fill_tree_attributes(
    js_xgb_node, attr_pairs, tree_weights, is_classifier, tree_info=None
):
    """Add the nodes of every tree in ``js_xgb_node`` to ``attr_pairs``.

    :param js_xgb_node: list of trees, one JSON-like node dict per tree
    :param attr_pairs: attribute dictionary, updated in place by
        ``XGBConverter._fill_node_attributes``
    :param tree_weights: one weight per tree, paired with ``js_xgb_node``
        by position
    :param is_classifier: forwarded unchanged to
        ``XGBConverter._fill_node_attributes``
    :param tree_info: optional sequence giving, for each tree index, the
        ``target_id`` forwarded to ``XGBConverter._fill_node_attributes``;
        when ``None`` every tree uses target ``0``
    :raises TypeError: if ``js_xgb_node`` is not a list
    :raises ValueError: if ``tree_info`` has fewer entries than there are
        trees to convert
    """
    if not isinstance(js_xgb_node, list):
        raise TypeError("js_xgb_node must be a list")

    # zip() stops at the shorter of the two sequences; materialise the pairs
    # so the number of trees actually converted is known before the loop.
    weighted_trees = list(zip(js_xgb_node, tree_weights))

    # Validate up front: failing halfway through would leave attr_pairs
    # partially filled and surface only as a bare IndexError.
    if tree_info is not None and len(tree_info) < len(weighted_trees):
        raise ValueError(
            "tree_info has {0} entries but {1} trees must be converted".format(
                len(tree_info), len(weighted_trees)
            )
        )

    for treeid, (jstree, w) in enumerate(weighted_trees):
        remap = XGBConverter._remap_nodeid(jstree)
        ids_covered = set()
        target_id = tree_info[treeid] if tree_info is not None else 0
        XGBConverter._fill_node_attributes(
            treeid,
            w,
            jstree,
            attr_pairs,
            is_classifier,
            remap,
            ids_covered,
            target_id,
        )


Expand Down Expand Up @@ -401,12 +419,23 @@ def convert(scope, operator, container):
if best_ntree_limit and best_ntree_limit < len(js_trees):
js_trees = js_trees[:best_ntree_limit]

params = XGBConverter.get_xgb_params(xgb_node)
n_targets = params["n_targets"]

tree_info = None
if n_targets > 1:
raw = json.loads(xgb_node.get_booster().save_raw(raw_format="json"))
tree_info = raw["learner"]["gradient_booster"]["model"]["tree_info"]
if best_ntree_limit and best_ntree_limit < len(tree_info):
tree_info = tree_info[:best_ntree_limit]
Comment on lines +427 to +430

Copilot AI Apr 28, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tree_info is obtained by calling save_raw(raw_format="json") and then json.loads(...), which parses the entire model a second time (in addition to get_dump(..., dump_format="json")). For large multi-target models this can significantly increase conversion time/memory. Consider extracting only the tree_info array (e.g., via a lightweight parse/regex similar to onnxmltools/convert/xgboost/_parse.py:_get_attributes) or using a deterministic mapping (like round-robin treeid % n_targets) as a fallback when full JSON parsing isn’t necessary.

Suggested change
raw = json.loads(xgb_node.get_booster().save_raw(raw_format="json"))
tree_info = raw["learner"]["gradient_booster"]["model"]["tree_info"]
if best_ntree_limit and best_ntree_limit < len(tree_info):
tree_info = tree_info[:best_ntree_limit]
raw = xgb_node.get_booster().save_raw(raw_format="json")
if isinstance(raw, bytes):
raw = raw.decode("utf-8")
tree_info_key = '"tree_info"'
tree_info = None
key_pos = raw.find(tree_info_key)
if key_pos >= 0:
array_start = raw.find("[", key_pos)
if array_start >= 0:
depth = 0
array_end = -1
for i in range(array_start, len(raw)):
if raw[i] == "[":
depth += 1
elif raw[i] == "]":
depth -= 1
if depth == 0:
array_end = i + 1
break
if array_end > 0:
try:
tree_info = json.loads(raw[array_start:array_end])
except (TypeError, ValueError):
tree_info = None
if tree_info is None:
tree_info = [tree_id % n_targets for tree_id in range(len(js_trees))]
if best_ntree_limit and best_ntree_limit < len(tree_info):
tree_info = tree_info[:best_ntree_limit]
elif len(tree_info) != len(js_trees):
tree_info = [tree_id % n_targets for tree_id in range(len(js_trees))]

Copilot uses AI. Check for mistakes.

XGBConverter.fill_tree_attributes(
js_trees, attr_pairs, [1 for _ in js_trees], False
js_trees, attr_pairs, [1 for _ in js_trees], False, tree_info
)

params = XGBConverter.get_xgb_params(xgb_node)
attr_pairs["n_targets"] = params["n_targets"]
attr_pairs["n_targets"] = n_targets
if len(attr_pairs["base_values"]) == 1 and n_targets > 1:
attr_pairs["base_values"] = attr_pairs["base_values"] * n_targets

# add nodes
objectives_with_loglink = {"count:poisson", "reg:gamma", "reg:tweedie"}
Expand Down
14 changes: 11 additions & 3 deletions onnxmltools/convert/xgboost/shape_calculators/Regressor.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
# SPDX-License-Identifier: Apache-2.0

from ...common._registration import register_shape_calculator
from ...common.shape_calculator import calculate_linear_regressor_output_shapes
from ...common.data_types import FloatTensorType
from ..common import get_xgb_params

register_shape_calculator("XGBRegressor", calculate_linear_regressor_output_shapes)
register_shape_calculator("XGBRFRegressor", calculate_linear_regressor_output_shapes)

def calculate_xgboost_regressor_output_shapes(operator):
    """Set the regressor output type to a float tensor ``[N, n_targets]``.

    ``N`` is the first dimension of the operator's single input and
    ``n_targets`` is read from the XGBoost parameters of
    ``operator.raw_operator`` (defaulting to 1 when the key is absent).

    :param operator: operator whose ``outputs[0].type`` is overwritten
    :raises RuntimeError: if the operator does not have exactly one input
        and exactly one output
    """
    # Check the wiring before indexing inputs[0] / outputs[0], so a
    # mis-wired operator fails with a clear message instead of an IndexError
    # or a silently wrong shape.
    # NOTE(review): the project's check_input_and_output_numbers helper could
    # replace this hand-rolled check once it is imported in this module.
    n_inputs = len(operator.inputs)
    n_outputs = len(operator.outputs)
    if n_inputs != 1 or n_outputs != 1:
        raise RuntimeError(
            "XGBoost regressor expects exactly 1 input and 1 output, "
            "got {0} input(s) and {1} output(s)".format(n_inputs, n_outputs)
        )

    N = operator.inputs[0].type.shape[0]
    n_targets = get_xgb_params(operator.raw_operator).get("n_targets", 1)
    operator.outputs[0].type = FloatTensorType([N, n_targets])
Comment on lines +8 to +11

Copilot AI Apr 28, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new regressor shape calculator no longer validates the expected input/output counts (the previous calculate_linear_regressor_output_shapes did). This can allow silent shape inference issues if the operator is wired incorrectly. Consider calling check_input_and_output_numbers(operator, input_count_range=1, output_count_range=1) (and, if consistent with other calculators, check_input_and_output_types) before setting the output type.

Copilot uses AI. Check for mistakes.


# Both XGBoost regressor operator names share the n_targets-aware shape
# calculator defined in this module.
register_shape_calculator("XGBRegressor", calculate_xgboost_regressor_output_shapes)
register_shape_calculator("XGBRFRegressor", calculate_xgboost_regressor_output_shapes)
64 changes: 62 additions & 2 deletions tests/xgboost/test_xgboost_issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

import unittest

import numpy as np
from numpy.testing import assert_allclose
Comment on lines +5 to +6

Copilot AI Apr 28, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With the new top-level import numpy as np, the inner import numpy as np inside test_issue_676 is now redundant. Consider removing the inner import to avoid duplication and keep imports consistent within the test module.

Copilot uses AI. Check for mistakes.

try:
from xgboost import XGBRegressor
except Exception:
Expand Down Expand Up @@ -34,8 +37,9 @@ def xgbregressor_shape_calculator(operator):
convert_xgboost,
)
# Your data and labels
X = np.random.rand(100, 10)
y = np.random.rand(100, 2)
rng = np.random.default_rng()
X = rng.random((100, 10))
y = rng.random((100, 2))

# Train XGBoost regressor
model = xgboost.XGBRegressor(
Expand All @@ -58,6 +62,62 @@ def xgbregressor_shape_calculator(operator):
got = sess.run(None, {"float_input": X.astype(np.float32)})
self.assertEqual(got[0].shape, (100, 2))

@unittest.skipIf(XGBRegressor is None, "xgboost is not available")
def test_issue_676_values(self):
    """Compare converted multi-target predictions with ``model.predict``.

    Fits a 3-estimator ``reg:squarederror`` regressor on seeded random data
    with 10 features and 10 targets, converts it to ONNX, and checks that
    the ONNX Runtime output has the expected shape and matches the xgboost
    predictions on the training inputs.
    """
    import onnxruntime
    import xgboost
    from onnxmltools.convert import convert_xgboost
    from onnxmltools.convert.common.data_types import FloatTensorType

    n_rows, n_features, n_outputs = 50, 10, 10

    # Seeded generator; features are drawn before targets.
    generator = np.random.default_rng(0)
    features = generator.random((n_rows, n_features)).astype(np.float32)
    targets = generator.random((n_rows, n_outputs))

    regressor = xgboost.XGBRegressor(objective="reg:squarederror", n_estimators=3)
    regressor.fit(features, targets)

    input_spec = [("float_input", FloatTensorType([None, n_features]))]
    converted = convert_xgboost(regressor, initial_types=input_spec)
    session = onnxruntime.InferenceSession(
        converted.SerializeToString(), providers=["CPUExecutionProvider"]
    )

    onnx_outputs = session.run(None, {"float_input": features})
    got = onnx_outputs[0]
    expected = regressor.predict(features)

    self.assertEqual(got.shape, (n_rows, n_outputs))
    assert_allclose(got, expected, rtol=1e-5, atol=1e-5)

@unittest.skipIf(XGBRegressor is None, "xgboost is not available")
def test_quantile_regression(self):
    """Compare converted quantile-regression predictions with ``model.predict``.

    Fits a 3-estimator ``reg:quantileerror`` regressor with three quantile
    levels on seeded random data (3 features, scalar target), converts it
    to ONNX, and checks that the ONNX Runtime output has one column per
    quantile and matches the xgboost predictions on the training inputs.
    """
    import onnxruntime
    import xgboost
    from onnxmltools.convert import convert_xgboost
    from onnxmltools.convert.common.data_types import FloatTensorType

    n_rows, n_features = 20, 3
    quantiles = [0.1, 0.5, 0.9]

    # Seeded generator; features are drawn before the target vector.
    generator = np.random.default_rng(0)
    features = generator.random((n_rows, n_features)).astype(np.float32)
    target = generator.random(n_rows)

    regressor = xgboost.XGBRegressor(
        objective="reg:quantileerror",
        quantile_alpha=quantiles,
        n_estimators=3,
    )
    regressor.fit(features, target)

    input_spec = [("input", FloatTensorType([None, n_features]))]
    converted = convert_xgboost(regressor, initial_types=input_spec)
    session = onnxruntime.InferenceSession(
        converted.SerializeToString(), providers=["CPUExecutionProvider"]
    )

    onnx_outputs = session.run(None, {"input": features})
    got = onnx_outputs[0]
    expected = regressor.predict(features)

    self.assertEqual(got.shape, (n_rows, len(quantiles)))
    assert_allclose(got, expected, rtol=1e-5, atol=1e-5)


if __name__ == "__main__":
unittest.main()
Loading