Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Build-context exclusions for images built with the repository root as context.
# Patterns are matched relative to the context root: a bare name or glob only
# matches at the top level, so a `**/` prefix is needed to match at any depth.

# Version-control metadata
.git
.gitignore

# Virtual environments and uv caches (top level and nested directories)
.venv
**/.venv
.uv-cache
**/.uv-cache

# Python bytecode
__pycache__
**/__pycache__
*.pyc
**/*.pyc

# Packaging artifacts
dist
build
*.egg-info
**/*.egg-info

# Top-level log and output directories
logs
outputs
44 changes: 44 additions & 0 deletions .github/workflows/sdk-package.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Builds the SDK wheel and sdist, checks their metadata with twine, and uploads
# the result as a workflow artifact.
name: SDK Package

on:
  push:
    branches: [main]
    paths:
      - 'sdk/**'
      - 'pyproject.toml'
      - '.github/workflows/sdk-package.yml'
  pull_request:
    paths:
      - 'sdk/**'
      - 'pyproject.toml'
      - '.github/workflows/sdk-package.yml'
  workflow_dispatch:

# The job only needs to read the repository; keep the default token minimal.
permissions:
  contents: read

jobs:
  build-sdk:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install uv
        uses: astral-sh/setup-uv@v6

      - name: Build SDK package
        run: uv build --package system-intelligence-sdk --wheel --sdist

      - name: Verify package metadata
        run: uvx twine check dist/system_intelligence_sdk-*

      - name: Upload SDK dist artifacts
        uses: actions/upload-artifact@v4
        with:
          name: sdk-dist
          path: dist/*
          retention-days: 14
          # An empty dist/ means the build step silently produced nothing.
          if-no-files-found: error
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ __pycache__/
*.pyc
.venv/
venv/
build/
dist/
*.egg-info/

# IDE
.vscode/
Expand Down
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ System Intelligence Benchmark currently includes the following example benchmark
- **SDK** (`sdk/`) - Software development kit providing evaluators, LLM interfaces, and utility functions
- **Documentation** (`doc/`) - Guides and documentation for using and contributing to System Intelligence Benchmark

For the canonical repository boundaries and migration direction, see [doc/project_structure.md](doc/project_structure.md).
For SDK packaging and release flow, see [doc/sdk_packaging.md](doc/sdk_packaging.md).

### Prerequisites

- Python 3.9+
Expand Down Expand Up @@ -145,4 +148,3 @@ trademarks or logos is subject to and must follow
[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
Any use of third-party trademarks or logos are subject to those third-party's policies.

66 changes: 37 additions & 29 deletions benchmarks/arteval_bench/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,34 +1,42 @@
FROM ubuntu:24.04
FROM ghcr.io/astral-sh/uv:python3.11-bookworm-slim AS builder

ARG DEBIAN_FRONTEND=noninteractive

USER root
WORKDIR /workspace
COPY . /workspace
# Build wheels for every workspace package (the SDK included) in one pass.
# A failing build aborts the image build here instead of being swallowed and
# resurfacing later as a missing wheel.
RUN mkdir -p /workspace/dist \
    && uv build --all-packages --wheel -o /workspace/dist

WORKDIR /
COPY . .
FROM ghcr.io/astral-sh/uv:python3.11-bookworm-slim

RUN rm -rf /var/lib/apt/lists/* \
&& apt-get update -o Acquire::Retries=5 \
&& apt-get install -y --no-install-recommends \
build-essential \
git \
wget \
python3-pip \
python3-venv \
pipx \
ARG DEBIAN_FRONTEND=noninteractive
USER root
RUN apt-get update && apt-get install -y --no-install-recommends git \
&& rm -rf /var/lib/apt/lists/*

# SWE-ReX will always attempt to install its server into your docker container
# however, this takes a couple of seconds. If we already provide it in the image,
# this is much faster.
RUN pipx install swe-rex
RUN pipx ensurepath

ENV PATH="/root/.local/bin:${PATH}"
ENV PATH="/usr/local/go/bin:${PATH}"

SHELL ["/bin/bash", "-c"]

RUN chmod +x install.sh test.sh && ./install.sh

CMD ["bash"]
# Build with repository root as context:
# docker build -f benchmarks/arteval_bench/Dockerfile .
WORKDIR /workspace
COPY . /workspace
COPY --from=builder /workspace/dist/*.whl /tmp/dist/

# Install the prebuilt wheels into a virtual environment local to the benchmark.
WORKDIR /workspace/benchmarks/arteval_bench
# Picks the first SDK wheel and the first arteval_bench wheel found in /tmp/dist
# and fails the build, after listing /tmp/dist, when either one is missing.
# Then recreates .venv from scratch and installs both wheels into it.
# The final command prints the installed sweagent version; under `set -e` the
# build fails if that metadata lookup fails.
RUN set -eux; \
SDK_WHEEL="$(ls /tmp/dist/system_intelligence_sdk-*.whl | head -n1 || true)"; \
BENCH_WHEEL="$(ls /tmp/dist/arteval_bench-*.whl | head -n1 || true)"; \
if [ -z "$SDK_WHEEL" ]; then \
echo "Missing SDK wheel in /tmp/dist. Build with repo root context:"; \
echo "docker build -t arteval_bench -f benchmarks/arteval_bench/Dockerfile ."; \
ls -1 /tmp/dist || true; \
exit 1; \
fi; \
if [ -z "$BENCH_WHEEL" ]; then \
echo "Missing arteval_bench wheel in /tmp/dist."; \
ls -1 /tmp/dist || true; \
exit 1; \
fi; \
rm -rf .venv; \
uv venv .venv; \
uv pip install --python .venv/bin/python "$SDK_WHEEL" "$BENCH_WHEEL"; \
.venv/bin/python -c "import importlib.metadata as m; print(m.version('sweagent'))"

CMD ["bash"]
30 changes: 13 additions & 17 deletions benchmarks/arteval_bench/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,24 @@

set -e # Exit immediately on error.

if ! command -v uv >/dev/null 2>&1; then
echo "==> uv not found. Installing uv..."
curl -LsSf https://astral.sh/uv/install.sh | sh
export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
Comment on lines +6 to +8
Copy link

Copilot AI Mar 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The install script auto-installs uv by piping a remote shell script from the network into sh. That pattern is a supply-chain risk and also makes installs non-reproducible in locked-down environments. Prefer documenting a manual uv installation step (or at least prompting for confirmation / verifying a pinned installer checksum) instead of executing a remote script automatically.

Suggested change
echo "==> uv not found. Installing uv..."
curl -LsSf https://astral.sh/uv/install.sh | sh
export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
echo "==> Error: 'uv' command not found."
echo "Please install 'uv' manually before running this script."
echo "For installation instructions, see: https://docs.astral.sh/uv/getting-started/installation/"
exit 1

Copilot uses AI. Check for mistakes.
fi

REPO_ROOT="$(git rev-parse --show-toplevel 2>/dev/null || pwd)"
export UV_CACHE_DIR="${UV_CACHE_DIR:-${REPO_ROOT}/.uv-cache}"

# if .venv does not exist, create it
if [ -d ".venv" ]; then
echo "==> .venv already exists, skipping creation."
else
echo "==> Creating .venv directory..."

python3 -m venv .venv
source .venv/bin/activate

if [ ! -d "SWE-agent" ]; then
echo "==> Install SWE-agent and its dependencies..."
git clone https://github.com/SWE-agent/SWE-agent.git
cd SWE-agent
git checkout 0c27f286303a939aa868ad2003bc4b6776771791
pip install --editable .
sweagent --help
cd ..
else
echo "==> SWE-agent repository already exists, skipping clone."
fi

deactivate
uv venv .venv
fi

uv sync --extra dev
uv run --no-sync sweagent --help >/dev/null

echo "==> ArtEvalBench environment is set up successfully."
28 changes: 28 additions & 0 deletions benchmarks/arteval_bench/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
[project]
name = "arteval-bench"
version = "0.1.0"
description = "ArtEval benchmark package"
# Must not be looser than the strictest dependency: this package's dependency
# constraints need Python 3.11+, so advertising 3.9/3.10 support is incorrect.
requires-python = ">=3.11"
dependencies = [
"system-intelligence-sdk>=0.1.0",
"requests",
"azure-identity",
"sweagent @ git+https://github.com/SWE-agent/SWE-agent.git@v1.1.0",
Copy link

Copilot AI Mar 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The sweagent dependency is brought in via a Git URL pinned only to a tag (v1.1.0), which is mutable and can be retargeted to arbitrary commits. If the SWE-agent repository or its release tags are compromised, future installs/builds could transparently pull and execute attacker-controlled code in environments that hold API keys or other secrets. Prefer pinning this dependency to an immutable commit SHA (or a verified release artifact) so that the exact code version being executed cannot be changed without explicitly updating this configuration.

Copilot uses AI. Check for mistakes.
]
Copy link

Copilot AI Mar 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PR description says arteval-bench was updated to require Python >=3.11 due to dependency constraints, but this pyproject.toml still declares requires-python = ">=3.9". Align the metadata with the actual dependency requirements (or adjust dependencies) to avoid installs that succeed initially but fail at runtime.

Copilot uses AI. Check for mistakes.

# Development-only tooling, installed via the `dev` extra.
[project.optional-dependencies]
dev = [
"pytest>=8.0.0",
"ruff>=0.6.0",
]

# Wheels are built with uv's own build backend, held to the 0.10.x series
# (>=0.10.4).
[build-system]
requires = ["uv_build>=0.10.4,<0.11.0"]
build-backend = "uv_build"

# The module is named `src` and the module root is left empty.
# NOTE(review): this presumably ships the project's `src/` directory as a
# top-level package literally called `src` — confirm that name is intended.
[tool.uv.build-backend]
module-name = "src"
module-root = ""

# Resolve the SDK dependency from the uv workspace.
[tool.uv.sources]
system-intelligence-sdk = { workspace = true }
16 changes: 9 additions & 7 deletions benchmarks/arteval_bench/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,22 @@ NEW_MODEL_NAME="${MODEL_NAME//\//_}"
# export OPENAI_BASE_URL="http://localhost:2327/v1"
# export OPENAI_API_KEY="EMPTY"

source .venv/bin/activate
if [ ! -x ".venv/bin/python" ]; then
echo "==> .venv is missing. Run ./install.sh first."
exit 1
fi

echo "==> Start to run ArtEvalBench"
# Note that if your benchmark has multiple tasks, you need to add --task <task>
# in your code to enable task selection.
# sweagent --help
# python src/main.py \
# python src/core/main.py \
# --task "test"
# --save_path "./outputs/systemcourseproject__${NEW_MODEL_NAME}__$(date +"%Y-%m-%d_%H-%M-%S")" \

python src/main_setup.py
# --model "$MODEL_NAME" \
uv run --no-sync python src/core/main.py \
--model_name "${MODEL_NAME}"
# --save_path "./outputs/systemcourseproject__${NEW_MODEL_NAME}__$(date +"%Y-%m-%d_%H-%M-%S")" \

# python src/main_setup.py \
# uv run --no-sync python src/core/main.py \
# --input_json "./data/benchmark/course_lab_task_examples.jsonl"

deactivate
1 change: 1 addition & 0 deletions benchmarks/arteval_bench/src/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""ArtEval benchmark package."""
3 changes: 0 additions & 3 deletions benchmarks/arteval_bench/src/core/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,8 @@
import argparse
import json
import os
import sys
from datetime import datetime

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))

from sdk.logger import logger
from sdk.utils import set_llm_endpoint_from_config

Expand Down
3 changes: 0 additions & 3 deletions benchmarks/arteval_bench/src/core/main_patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,8 @@
import argparse
import json
import os
import sys
from datetime import datetime

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))

from sdk.logger import logger
from sdk.utils import set_llm_endpoint_from_config

Expand Down
3 changes: 0 additions & 3 deletions benchmarks/arteval_bench/src/core/run_eval_in_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,6 @@

import asyncio
import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))

from swerex.deployment.docker import DockerDeploymentConfig
from swerex.runtime.abstract import BashAction, Command, CreateBashSessionRequest, UploadRequest
Expand Down
5 changes: 2 additions & 3 deletions benchmarks/arteval_bench/src/core/run_eval_sweagent.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import sys
import json
import os
import subprocess

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))

from patch_evaluator import pacth_eval
Copy link

Copilot AI Mar 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

run_eval_sweagent.py is inside the core package, but it imports a sibling module with from patch_evaluator import .... This relies on running the file as a script (or having src/core on sys.path) and can break once the benchmark is used as an installed package. Switch to an explicit relative import from the same package to make this workspace/packaging refactor fully sys.path-hack-free.

Suggested change
from patch_evaluator import pacth_eval
from .patch_evaluator import pacth_eval

Copilot uses AI. Check for mistakes.

from sdk.logger import logger
Expand Down
5 changes: 1 addition & 4 deletions benchmarks/arteval_bench/src/core/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
"""Helper methods for running tests in a deployment."""

import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))

from sdk.logger import logger

Expand All @@ -17,4 +14,4 @@ def get_task(file_path):
+ f" means you can directly proceed with executing the steps in the README"
+ f" without asking for approval or confirmation. Once you rached the end"
+ f" of the README you must exit the Docker image gracefully.")
return task
return task
1 change: 1 addition & 0 deletions benchmarks/arteval_bench/src/evaluator/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""ArtEval evaluator package."""
48 changes: 35 additions & 13 deletions benchmarks/cache_algo_bench/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,14 +1,36 @@
FROM ubuntu:24.04

WORKDIR /usr/src
COPY . .
RUN apt-get update && apt-get install -y \
build-essential \
git \
wget \
python3-pip \
python3-venv

RUN chmod +x install.sh test.sh && ./install.sh

FROM ghcr.io/astral-sh/uv:python3.11-bookworm-slim AS builder

WORKDIR /workspace
COPY . /workspace
# Build wheels for every workspace package (the SDK included) in one pass.
# A failing build aborts the image build here instead of being swallowed and
# resurfacing later as a missing wheel.
RUN mkdir -p /workspace/dist \
    && uv build --all-packages --wheel -o /workspace/dist

FROM ghcr.io/astral-sh/uv:python3.11-bookworm-slim

# Build with repository root as context:
# docker build -f benchmarks/cache_algo_bench/Dockerfile .
WORKDIR /workspace
COPY . /workspace
COPY --from=builder /workspace/dist/*.whl /tmp/dist/

# Install the prebuilt wheels into a virtual environment local to the benchmark.
WORKDIR /workspace/benchmarks/cache_algo_bench
# Picks the first SDK wheel and the first cache_algo_bench wheel found in
# /tmp/dist and fails the build, after listing /tmp/dist, when either one is
# missing. Then recreates .venv from scratch and installs both wheels into it.
# NOTE(review): this stage does not install git; if cache_algo_bench ever gains
# a git-URL dependency the install step would presumably fail — confirm.
RUN set -eux; \
SDK_WHEEL="$(ls /tmp/dist/system_intelligence_sdk-*.whl | head -n1 || true)"; \
BENCH_WHEEL="$(ls /tmp/dist/cache_algo_bench-*.whl | head -n1 || true)"; \
if [ -z "$SDK_WHEEL" ]; then \
echo "Missing SDK wheel in /tmp/dist. Build with repo root context:"; \
echo "docker build -t cache_algo_bench -f benchmarks/cache_algo_bench/Dockerfile ."; \
ls -1 /tmp/dist || true; \
exit 1; \
fi; \
if [ -z "$BENCH_WHEEL" ]; then \
echo "Missing cache_algo_bench wheel in /tmp/dist."; \
ls -1 /tmp/dist || true; \
exit 1; \
fi; \
rm -rf .venv; \
uv venv .venv; \
uv pip install --python .venv/bin/python "$SDK_WHEEL" "$BENCH_WHEEL"

# ENTRYPOINT ["./test.sh"]
Loading
Loading