Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 3 additions & 8 deletions .github/workflows/multi_arch_build_portable_linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# 1. foundation (generic) - sysdeps, base
# 2. compiler-runtime (generic) - compiler, runtimes, profiler-core
# 3. math-libs (per-arch) - BLAS, FFT, etc.
# 4. comm-libs (per-arch) - RCCL, rocshmem (parallel to math-libs)
# 4. comm-libs (generic) - RCCL, rocshmem (parallel to math-libs)
# 5. debug-tools (generic) - amd-dbgapi, rocr-debug-agent, rocgdb (parallel to math-libs)
# 6. dctools-core (generic) - RDC (parallel to math-libs)
# 7. profiler-apps (generic) - rocprofiler-systems (parallel to math-libs)
Expand Down Expand Up @@ -111,22 +111,17 @@ jobs:
id-token: write

# ==========================================================================
# STAGE: comm-libs (per-arch, parallel to math-libs)
# STAGE: comm-libs (generic, parallel to math-libs)
# ==========================================================================
comm-libs:
needs: compiler-runtime
if: ${{ !cancelled() && !failure() && !contains(inputs.prebuilt_stages, 'comm-libs') }}
strategy:
fail-fast: false
matrix:
family_info: ${{ fromJSON(inputs.matrix_per_family_json) }}
uses: ./.github/workflows/multi_arch_build_portable_linux_artifacts.yml
secrets: inherit
with:
stage_name: comm-libs
stage_display_name: "Stage - Comm Libs (${{ matrix.family_info.amdgpu_family }})"
stage_display_name: "Stage - Comm Libs"
timeout_minutes: 240 # 4 hours
amdgpu_family: ${{ matrix.family_info.amdgpu_family }}
dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }}
rocm_package_version: ${{ inputs.rocm_package_version }}
permissions:
Expand Down
9 changes: 5 additions & 4 deletions BUILD_TOPOLOGY.toml
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,8 @@ artifact_groups = ["math-libs", "ml-libs"]
type = "per-arch"

[build_stages.comm-libs]
description = "Communication libraries per architecture (can run parallel to math-libs)"
description = "Communication libraries (can run parallel to math-libs)"
artifact_groups = ["comm-libs"]
type = "per-arch"

[build_stages.debug-tools]
description = "ROCm debug tools"
Expand Down Expand Up @@ -268,10 +267,12 @@ source_sets = ["rocm-libraries", "ml-frameworks"]

[artifact_groups.comm-libs]
description = "Communication libraries"
type = "per-arch"
type = "generic"
artifact_group_deps = ["hip-runtime"]
# TODO: rocm-systems included for projects/hip/VERSION (see CMakeLists.txt)
source_sets = ["comm-libs", "rocm-systems"]
# Limit dist families to Instinct/CDNA GPUs (dcgpu and legacy gfx9xx).
restrict_dist_families_regex = "dcgpu|^gfx9"

[artifact_groups.profiler-core]
description = "Core profiling libraries and annotation support"
Expand Down Expand Up @@ -658,7 +659,7 @@ artifact_deps = ["core-runtime", "core-hip", "base", "sysdeps", "sysdeps-amd-mes
feature_group = "MEDIA_LIBS" # Controlled by THEROCK_ENABLE_MEDIA_LIBS
disable_platforms = ["windows"]

# --- Communication Libraries (per-arch) ---
# --- Communication Libraries ---

[artifacts.rccl]
artifact_group = "comm-libs"
Expand Down
4 changes: 4 additions & 0 deletions build_tools/_therock_utils/build_topology.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ class ArtifactGroup:
type: str # "generic" or "per-arch"
artifact_group_deps: List[str] = field(default_factory=list)
source_sets: List[str] = field(default_factory=list)
restrict_dist_families_regex: Optional[str] = None


@dataclass
Expand Down Expand Up @@ -167,6 +168,9 @@ def _load_topology(self):
type=group_data.get("type", "generic"),
artifact_group_deps=group_data.get("artifact_group_deps", []),
source_sets=group_data.get("source_sets", []),
restrict_dist_families_regex=group_data.get(
"restrict_dist_families_regex"
),
)

# Parse artifacts
Expand Down
39 changes: 35 additions & 4 deletions build_tools/configure_stage.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,33 @@ def get_stage_features(
return features


def _filter_dist_families_for_stage(
    topology: BuildTopology, stage_name: str, dist_amdgpu_families: str
) -> str:
    """Filter dist_amdgpu_families by the stage's artifact group restrictions.

    Every artifact group listed on the stage that defines a
    ``restrict_dist_families_regex`` contributes one pattern. A family is
    kept when at least one of those patterns matches it (``re.search``), so
    multiple restricted groups are OR-ed together. Groups without a
    restriction (or not present in the topology) contribute no pattern; they
    do not widen the filter set up by other groups in the same stage.

    Args:
        topology: Topology providing ``build_stages`` and ``artifact_groups``.
        stage_name: Name of the build stage being configured.
        dist_amdgpu_families: Semicolon-separated list of family names.

    Returns:
        ``dist_amdgpu_families`` unchanged when the stage is unknown, the
        input is empty, or no group in the stage defines a restriction.
        Otherwise the matching families re-joined with ``;`` in their
        original order (empty segments are dropped). The result is an empty
        string when no family matches.
    """
    import re

    stage = topology.build_stages.get(stage_name)
    if not stage or not dist_amdgpu_families:
        return dist_amdgpu_families
    patterns = [
        g.restrict_dist_families_regex
        for name in stage.artifact_groups
        if (g := topology.artifact_groups.get(name)) and g.restrict_dist_families_regex
    ]
    if not patterns:
        return dist_amdgpu_families
    families = [f for f in dist_amdgpu_families.split(";") if f]
    # Match each pattern on its own instead of gluing them into one
    # "(p1)|(p2)" expression: wrapping adds capturing groups (renumbering any
    # backreferences inside a pattern) and pushes leading inline flags such
    # as "(?i)" away from the start of the expression, which newer Python
    # versions reject.
    return ";".join(
        f for f in families if any(re.search(p, f) for p in patterns)
    )


def generate_cmake_args(
stage_name: str,
amdgpu_families: str,
Expand Down Expand Up @@ -132,10 +159,14 @@ def generate_cmake_args(
if amdgpu_families:
args.append(f"-DTHEROCK_AMDGPU_FAMILIES={amdgpu_families}")

# GPU families for dist targets (all architectures in the distribution)
# Quote the value since it contains semicolons (CMake list separator)
if dist_amdgpu_families:
args.append(f'-DTHEROCK_DIST_AMDGPU_FAMILIES="{dist_amdgpu_families}"')
# GPU families for dist targets (all architectures in the distribution).
# Filtered by any restrict_dist_families_regex on artifact groups in this
# stage. Quote the value since it contains semicolons (CMake list separator).
filtered_dist = _filter_dist_families_for_stage(
topology, stage_name, dist_amdgpu_families
)
if filtered_dist:
args.append(f'-DTHEROCK_DIST_AMDGPU_FAMILIES="{filtered_dist}"')

# Manylinux Python executables for per-Python-version builds
# Quote values since they contain semicolons (CMake list separator)
Expand Down
16 changes: 16 additions & 0 deletions build_tools/github_actions/fetch_test_configurations.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,22 @@ def _get_script_path(script_name: str) -> str:
},
# Architectures that we have multi GPU setup for testing
"multi_gpu": {"linux": ["gfx94X-dcgpu"]},
# rccl is only built for Instinct/CDNA GPUs; exclude RDNA families to
# avoid test failures when those runners gain full test coverage.
# TODO: Consider replacing with an include_family mechanism once such a
# mechanism is supported.
"exclude_family": {
"linux": [
"gfx101X-dgpu",
"gfx103X-dgpu",
"gfx110X-all",
"gfx120X-all",
"gfx1150",
"gfx1151",
"gfx1152",
"gfx1153",
]
},
},
# rocprofiler-sdk tests
"rocprofiler-sdk": {
Expand Down
23 changes: 23 additions & 0 deletions build_tools/tests/build_topology_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,29 @@ def test_parse_artifact_groups(self):
runtime = topology.artifact_groups["runtime"]
self.assertEqual(runtime.artifact_group_deps, ["base"])

def test_parse_artifact_group_restrict_dist_families_regex(self):
    """restrict_dist_families_regex is read per artifact group; None if omitted."""
    self.write_topology(
        """
[artifact_groups.comm-libs]
description = "Communication libraries"
type = "generic"
restrict_dist_families_regex = "dcgpu|^gfx9"

[artifact_groups.math-libs]
description = "Math libraries"
type = "per-arch"
"""
    )

    groups = BuildTopology(self.topology_path).artifact_groups

    # A group that sets the key keeps the regex string as written.
    self.assertEqual(
        groups["comm-libs"].restrict_dist_families_regex, "dcgpu|^gfx9"
    )
    # A group that omits the key reports no restriction.
    self.assertIsNone(groups["math-libs"].restrict_dist_families_regex)

def test_parse_artifacts(self):
"""Test parsing artifacts."""
self.write_topology(
Expand Down
12 changes: 12 additions & 0 deletions comm-libs/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ if(THEROCK_ENABLE_RCCL)
set(_rccl_subproject_names)

therock_cmake_subproject_declare(rccl
USE_DIST_AMDGPU_TARGETS
EXTERNAL_SOURCE_DIR "${THEROCK_ROCM_SYSTEMS_SOURCE_DIR}/projects/rccl"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/rccl"
# High latency LTO link of a single library.
Expand Down Expand Up @@ -70,7 +71,13 @@ if(THEROCK_ENABLE_RCCL)
set(_rccl_tests_optional_deps "therock-openmpi")
endif()

# rccl-tests does not use USE_TEST_AMDGPU_TARGETS (which would default to
# all available targets) because it is bundled into the same artifact as
# rccl. Using USE_DIST_AMDGPU_TARGETS keeps both consistent and ensures the
# artifact only contains device code for Instinct/CDNA families, matching
# the restrict_dist_families_regex on the comm-libs artifact group.
therock_cmake_subproject_declare(rccl-tests
USE_DIST_AMDGPU_TARGETS
EXTERNAL_SOURCE_DIR "${THEROCK_ROCM_SYSTEMS_SOURCE_DIR}/projects/rccl-tests"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/rccl-tests"
BACKGROUND_BUILD
Expand Down Expand Up @@ -98,7 +105,12 @@ if(THEROCK_ENABLE_RCCL)
list(APPEND _rccl_subproject_names rccl-tests)
endif(THEROCK_BUILD_TESTING)

# target-neutral: built once for all dist targets in a single generic CI stage.
# RCCL contains HIP device code compiled per-architecture; USE_DIST_AMDGPU_TARGETS
# embeds all dist targets into a single artifact, avoiding per-family CI sharding.
# Remains target-specific in BUILD_TOPOLOGY.toml to preserve kpack splitting support.
therock_provide_artifact(rccl
TARGET_NEUTRAL
DESCRIPTOR artifact-rccl.toml
COMPONENTS
dbg
Expand Down
Loading