diff --git a/.github/workflows/multi_arch_build_portable_linux.yml b/.github/workflows/multi_arch_build_portable_linux.yml index af66ca5b64..b09688ad4e 100644 --- a/.github/workflows/multi_arch_build_portable_linux.yml +++ b/.github/workflows/multi_arch_build_portable_linux.yml @@ -7,7 +7,7 @@ # 1. foundation (generic) - sysdeps, base # 2. compiler-runtime (generic) - compiler, runtimes, profiler-core # 3. math-libs (per-arch) - BLAS, FFT, etc. -# 4. comm-libs (per-arch) - RCCL, rocshmem (parallel to math-libs) +# 4. comm-libs (generic) - RCCL, rocshmem (parallel to math-libs) # 5. debug-tools (generic) - amd-dbgapi, rocr-debug-agent, rocgdb (parallel to math-libs) # 6. dctools-core (generic) - RDC (parallel to math-libs) # 7. profiler-apps (generic) - rocprofiler-systems (parallel to math-libs) @@ -111,22 +111,17 @@ jobs: id-token: write # ========================================================================== - # STAGE: comm-libs (per-arch, parallel to math-libs) + # STAGE: comm-libs (generic, parallel to math-libs) # ========================================================================== comm-libs: needs: compiler-runtime if: ${{ !cancelled() && !failure() && !contains(inputs.prebuilt_stages, 'comm-libs') }} - strategy: - fail-fast: false - matrix: - family_info: ${{ fromJSON(inputs.matrix_per_family_json) }} uses: ./.github/workflows/multi_arch_build_portable_linux_artifacts.yml secrets: inherit with: stage_name: comm-libs - stage_display_name: "Stage - Comm Libs (${{ matrix.family_info.amdgpu_family }})" + stage_display_name: "Stage - Comm Libs" timeout_minutes: 240 # 4 hours - amdgpu_family: ${{ matrix.family_info.amdgpu_family }} dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }} rocm_package_version: ${{ inputs.rocm_package_version }} permissions: diff --git a/BUILD_TOPOLOGY.toml b/BUILD_TOPOLOGY.toml index 4ee972d62e..fdb2c9930d 100644 --- a/BUILD_TOPOLOGY.toml +++ b/BUILD_TOPOLOGY.toml @@ -153,9 +153,8 @@ artifact_groups = ["math-libs", 
"ml-libs"] type = "per-arch" [build_stages.comm-libs] -description = "Communication libraries per architecture (can run parallel to math-libs)" +description = "Communication libraries (can run parallel to math-libs)" artifact_groups = ["comm-libs"] -type = "per-arch" [build_stages.debug-tools] description = "ROCm debug tools" @@ -268,10 +267,12 @@ source_sets = ["rocm-libraries", "ml-frameworks"] [artifact_groups.comm-libs] description = "Communication libraries" -type = "per-arch" +type = "generic" artifact_group_deps = ["hip-runtime"] # TODO: rocm-systems included for projects/hip/VERSION (see CMakeLists.txt) source_sets = ["comm-libs", "rocm-systems"] +# Limit dist families to Instinct/CDNA GPUs (dcgpu and legacy gfx9xx). +restrict_dist_families_regex = "dcgpu|^gfx9" [artifact_groups.profiler-core] description = "Core profiling libraries and annotation support" @@ -658,7 +659,7 @@ artifact_deps = ["core-runtime", "core-hip", "base", "sysdeps", "sysdeps-amd-mes feature_group = "MEDIA_LIBS" # Controlled by THEROCK_ENABLE_MEDIA_LIBS disable_platforms = ["windows"] -# --- Communication Libraries (per-arch) --- +# --- Communication Libraries --- [artifacts.rccl] artifact_group = "comm-libs" diff --git a/build_tools/_therock_utils/build_topology.py b/build_tools/_therock_utils/build_topology.py index cbc38e7d3a..bd7d3a53d8 100644 --- a/build_tools/_therock_utils/build_topology.py +++ b/build_tools/_therock_utils/build_topology.py @@ -79,6 +79,7 @@ class ArtifactGroup: type: str # "generic" or "per-arch" artifact_group_deps: List[str] = field(default_factory=list) source_sets: List[str] = field(default_factory=list) + restrict_dist_families_regex: Optional[str] = None @dataclass @@ -167,6 +168,9 @@ def _load_topology(self): type=group_data.get("type", "generic"), artifact_group_deps=group_data.get("artifact_group_deps", []), source_sets=group_data.get("source_sets", []), + restrict_dist_families_regex=group_data.get( + "restrict_dist_families_regex" + ), ) # Parse 
artifacts diff --git a/build_tools/configure_stage.py b/build_tools/configure_stage.py index 53ad94735d..e90ad906ae 100644 --- a/build_tools/configure_stage.py +++ b/build_tools/configure_stage.py @@ -97,6 +97,33 @@ def get_stage_features( return features +def _filter_dist_families_for_stage( + topology: BuildTopology, stage_name: str, dist_amdgpu_families: str +) -> str: + """Filter dist_amdgpu_families based on artifact group restrictions. + + If any artifact group in the stage has a restrict_dist_families_regex, + only families matching that regex are included. Groups without a + restriction contribute no filter (all families pass through for them). + Multiple restricted groups are OR-ed together. + """ + import re + + stage = topology.build_stages.get(stage_name) + if not stage or not dist_amdgpu_families: + return dist_amdgpu_families + patterns = [ + g.restrict_dist_families_regex + for name in stage.artifact_groups + if (g := topology.artifact_groups.get(name)) and g.restrict_dist_families_regex + ] + if not patterns: + return dist_amdgpu_families + combined = "|".join(f"({p})" for p in patterns) + families = [f for f in dist_amdgpu_families.split(";") if f] + return ";".join(f for f in families if re.search(combined, f)) + + def generate_cmake_args( stage_name: str, amdgpu_families: str, @@ -132,10 +159,14 @@ def generate_cmake_args( if amdgpu_families: args.append(f"-DTHEROCK_AMDGPU_FAMILIES={amdgpu_families}") - # GPU families for dist targets (all architectures in the distribution) - # Quote the value since it contains semicolons (CMake list separator) - if dist_amdgpu_families: - args.append(f'-DTHEROCK_DIST_AMDGPU_FAMILIES="{dist_amdgpu_families}"') + # GPU families for dist targets (all architectures in the distribution). + # Filtered by any restrict_dist_families_regex on artifact groups in this + # stage. Quote the value since it contains semicolons (CMake list separator). 
+ filtered_dist = _filter_dist_families_for_stage( + topology, stage_name, dist_amdgpu_families + ) + if filtered_dist: + args.append(f'-DTHEROCK_DIST_AMDGPU_FAMILIES="{filtered_dist}"') # Manylinux Python executables for per-Python-version builds # Quote values since they contain semicolons (CMake list separator) diff --git a/build_tools/github_actions/fetch_test_configurations.py b/build_tools/github_actions/fetch_test_configurations.py index edeee3e639..b4f757e8f1 100644 --- a/build_tools/github_actions/fetch_test_configurations.py +++ b/build_tools/github_actions/fetch_test_configurations.py @@ -322,6 +322,22 @@ def _get_script_path(script_name: str) -> str: }, # Architectures that we have multi GPU setup for testing "multi_gpu": {"linux": ["gfx94X-dcgpu"]}, + # rccl is only built for Instinct/CDNA GPUs; exclude RDNA families to + # avoid test failures when those runners gain full test coverage. + # TODO: Consider replacing with an include_family mechanism once such a + # mechanism is supported. 
+ "exclude_family": { + "linux": [ + "gfx101X-dgpu", + "gfx103X-dgpu", + "gfx110X-all", + "gfx120X-all", + "gfx1150", + "gfx1151", + "gfx1152", + "gfx1153", + ] + }, }, # rocprofiler-sdk tests "rocprofiler-sdk": { diff --git a/build_tools/tests/build_topology_test.py b/build_tools/tests/build_topology_test.py index 8bf96cc63e..d851fcd4c0 100644 --- a/build_tools/tests/build_topology_test.py +++ b/build_tools/tests/build_topology_test.py @@ -116,6 +116,29 @@ def test_parse_artifact_groups(self): runtime = topology.artifact_groups["runtime"] self.assertEqual(runtime.artifact_group_deps, ["base"]) + def test_parse_artifact_group_restrict_dist_families_regex(self): + """Test parsing restrict_dist_families_regex on artifact groups.""" + self.write_topology( + """ + [artifact_groups.comm-libs] + description = "Communication libraries" + type = "generic" + restrict_dist_families_regex = "dcgpu|^gfx9" + + [artifact_groups.math-libs] + description = "Math libraries" + type = "per-arch" + """ + ) + + topology = BuildTopology(self.topology_path) + + comm = topology.artifact_groups["comm-libs"] + self.assertEqual(comm.restrict_dist_families_regex, "dcgpu|^gfx9") + + math = topology.artifact_groups["math-libs"] + self.assertIsNone(math.restrict_dist_families_regex) + def test_parse_artifacts(self): """Test parsing artifacts.""" self.write_topology( diff --git a/comm-libs/CMakeLists.txt b/comm-libs/CMakeLists.txt index 4b983a2652..d3b7b888bd 100644 --- a/comm-libs/CMakeLists.txt +++ b/comm-libs/CMakeLists.txt @@ -27,6 +27,7 @@ if(THEROCK_ENABLE_RCCL) set(_rccl_subproject_names) therock_cmake_subproject_declare(rccl + USE_DIST_AMDGPU_TARGETS EXTERNAL_SOURCE_DIR "${THEROCK_ROCM_SYSTEMS_SOURCE_DIR}/projects/rccl" BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/rccl" # High latency LTO link of a single library. 
@@ -70,7 +71,13 @@ if(THEROCK_ENABLE_RCCL) set(_rccl_tests_optional_deps "therock-openmpi") endif() + # rccl-tests does not use USE_TEST_AMDGPU_TARGETS (which would default to + # all available targets) because it is bundled into the same artifact as + # rccl. Using USE_DIST_AMDGPU_TARGETS keeps both consistent and ensures the + # artifact only contains device code for Instinct/CDNA families, matching + # the restrict_dist_families_regex on the comm-libs artifact group. therock_cmake_subproject_declare(rccl-tests + USE_DIST_AMDGPU_TARGETS EXTERNAL_SOURCE_DIR "${THEROCK_ROCM_SYSTEMS_SOURCE_DIR}/projects/rccl-tests" BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/rccl-tests" BACKGROUND_BUILD @@ -98,7 +105,12 @@ if(THEROCK_ENABLE_RCCL) list(APPEND _rccl_subproject_names rccl-tests) endif(THEROCK_BUILD_TESTING) + # target-neutral: built once for all dist targets in a single generic CI stage. + # RCCL contains HIP device code compiled per-architecture; USE_DIST_AMDGPU_TARGETS + # embeds all dist targets into a single artifact, avoiding per-family CI sharding. + # Remains target-specific in BUILD_TOPOLOGY.toml to preserve kpack splitting support. therock_provide_artifact(rccl + TARGET_NEUTRAL DESCRIPTOR artifact-rccl.toml COMPONENTS dbg