NOAA-GFDL · FlorianDeconinck · May 14, 2026 · May 14, 2026 · May 14, 2026 · May 15, 2026
diff --git a/external/dace b/external/dace
diff --git a/external/gt4py b/external/gt4py
diff --git a/ndsl/comm/communicator.py b/ndsl/comm/communicator.py
@@ -786,7 +786,7 @@ def __init__(
                 "Communicator needs to be instantiated with communication subsystem"
                 f" derived from `comm_abc.Comm`, got {type(comm)}."
             )
-        if comm.Get_size() != partitioner.total_ranks:
+        if comm.Get_size() < partitioner.total_ranks:
             raise ValueError(
                 f"was given a partitioner for {partitioner.total_ranks} ranks but a "
                 f"comm object with only {comm.Get_size()} ranks, are we running "

diff --git a/ndsl/config/backend.py b/ndsl/config/backend.py
@@ -52,6 +52,8 @@ class BackendLoopOrder(Enum):
     "orch:dace:cpu:KJI": "dace:cpu_KJI",
     "st:dace:gpu:KJI": "dace:gpu",
     "orch:dace:gpu:KJI": "dace:gpu",
+    "st:dace:gpu:IJK": "dace:gpu_IJK",
+    "orch:dace:gpu:IJK": "dace:gpu_IJK",
 }
 """Internal: match the NDSL backend names with the GT4Py names"""
 

diff --git a/ndsl/dsl/caches/cache_location.py b/ndsl/dsl/caches/cache_location.py
@@ -7,46 +7,48 @@ def identify_code_path(
     partitioner: Partitioner,
     single_code_path: bool,
 ) -> FV3CodePath:
-    """Determine which code path your rank will hit.
+    """
+    Determine which code path your rank will hit.
 
-    If single_code_path is True, single_code_path is True,
-    only one code path exists (case of doubly periodic grid).
+    If single_code_path is True, only one code path exists,
+    e.g. in case of a doubly periodic grid.
     If single_code_path is False, we are in the case of the
-    cube-sphere and we will look at our position on the tile."""
+    cube-sphere and we will look at our position on the tile.
+    """
 
     # Doubly-periodic or single tile grid
-    if single_code_path:
+    if single_code_path or partitioner.layout == (1, 1):
         return FV3CodePath.All
 
     # Cube-sphere
-    if partitioner.layout == (1, 1):
-        return FV3CodePath.All
-    elif partitioner.layout[0] == 1 or partitioner.layout[1] == 1:
+    if partitioner.layout[0] <= 1 or partitioner.layout[1] <= 1:
         raise NotImplementedError(
-            f"Build for layout {partitioner.layout} is not handled"
+            f"Build for layout {partitioner.layout} is not handled."
         )
-    else:
-        if partitioner.tile.on_tile_bottom(rank):
-            if partitioner.tile.on_tile_left(rank):
-                return FV3CodePath.BottomLeft
-            if partitioner.tile.on_tile_right(rank):
-                return FV3CodePath.BottomRight
-            else:
-                return FV3CodePath.Bottom
-        if partitioner.tile.on_tile_top(rank):
-            if partitioner.tile.on_tile_left(rank):
-                return FV3CodePath.TopLeft
-            if partitioner.tile.on_tile_right(rank):
-                return FV3CodePath.TopRight
-            else:
-                return FV3CodePath.Top
-        else:
-            if partitioner.tile.on_tile_left(rank):
-                return FV3CodePath.Left
-            if partitioner.tile.on_tile_right(rank):
-                return FV3CodePath.Right
-            else:
-                return FV3CodePath.Center
+
+    # Bottom row
+    if partitioner.tile.on_tile_bottom(rank):
+        if partitioner.tile.on_tile_left(rank):
+            return FV3CodePath.BottomLeft
+        if partitioner.tile.on_tile_right(rank):
+            return FV3CodePath.BottomRight
+        return FV3CodePath.Bottom
+
+    # Top row
+    if partitioner.tile.on_tile_top(rank):
+        if partitioner.tile.on_tile_left(rank):
+            return FV3CodePath.TopLeft
+        if partitioner.tile.on_tile_right(rank):
+            return FV3CodePath.TopRight
+        return FV3CodePath.Top
+
+    # Left & right column with corners already handled
+    if partitioner.tile.on_tile_left(rank):
+        return FV3CodePath.Left
+    if partitioner.tile.on_tile_right(rank):
+        return FV3CodePath.Right
+
+    return FV3CodePath.Center
 
 
 def get_cache_fullpath(code_path: FV3CodePath) -> str:

diff --git a/ndsl/dsl/caches/codepath.py b/ndsl/dsl/caches/codepath.py
@@ -3,10 +3,12 @@
 
 class FV3CodePath(enum.Enum):
     """Enum listing all possible code paths on a cube sphere.
+
     For any layout the cube sphere has up to 9 different code paths depending on
     the positioning of the rank on the tile and which of the edge/corner cases
     it has to handle, as well as the possibility for all boundary computations in
     the 1x1 layout case.
+
     Since the framework inlines code to optimize, we _cannot_ pre-suppose which code
     being kept and/or ejected. This enum serves as the ground truth to map rank to
     the proper generated code.

diff --git a/ndsl/dsl/dace/dace_config.py b/ndsl/dsl/dace/dace_config.py
@@ -10,14 +10,20 @@
 from gt4py.cartesian.utils.compiler import cxx_compiler_defaults, gpu_configuration
 
 from ndsl import LocalComm
+from ndsl.comm import Comm
 from ndsl.comm.communicator import Communicator
 from ndsl.comm.partitioner import Partitioner
 from ndsl.config import Backend
 from ndsl.dsl import NDSL_GLOBAL_PRECISION
 from ndsl.dsl.caches.cache_location import identify_code_path
 from ndsl.dsl.caches.codepath import FV3CodePath
+from ndsl.dsl.dace.hardware_config import get_gpu_hardware_defaults
 from ndsl.optional_imports import cupy as cp
-from ndsl.performance.collector import NullPerformanceCollector, PerformanceCollector
+from ndsl.performance.collector import (
+    AbstractPerformanceCollector,
+    NullPerformanceCollector,
+    PerformanceCollector,
+)
 
 
 if TYPE_CHECKING:
@@ -166,8 +172,8 @@ def __init__(
         Args:
             communicator: used for setting the distributed caches
             backend: string for the backend
-            tile_nx: x/y domain size for a single time
-            tile_nz: z domain size for a single time
+            tile_nx: x/y domain size for a single tile
+            tile_nz: z domain size for a single tile
             orchestration: orchestration mode from DaCeOrchestration
             time: trigger performance collection, available to user with
                 `performance_collector`
@@ -181,16 +187,12 @@ def __init__(
         # ToDo: DaceConfig becomes a bit more than a read-only config
         #       with this. Should be refactored into a DaceExecutor carrying a config
         self.loaded_dace_executables: DaceExecutables = {}
-        self.performance_collector = (
-            PerformanceCollector(
-                "InternalOrchestrationTimer",
-                comm=(
-                    LocalComm(0, 6, {}) if communicator is None else communicator.comm
-                ),
+        if not time:
+            self.performance_collector: AbstractPerformanceCollector = (
+                NullPerformanceCollector()
             )
-            if time
-            else NullPerformanceCollector()
-        )
+        else:
+            self.set_timer(communicator.comm if communicator else None)
 
         # Temporary. This is a bit too out of the ordinary for the common user.
         # We should refactor the architecture to allow for a `gtc:orchestrated:dace:X`
@@ -264,21 +266,29 @@ def __init__(
             march_option = "-mcpu=native" if is_arm_neoverse else "-march=native"
             # Removed --fast-math
             gpu_config = gpu_configuration(GT4PY_COMPILE_OPT_LEVEL)
+            gpu_cflags = " ".join(gpu_config.gpu_compile_flags).strip()
             dace.config.Config.set(
                 "compiler",
                 "cuda",
                 "args",
-                value=f"-std=c++14 -Xcompiler -fPIC -O{optimization_level} -Xcompiler {march_option} {gpu_config.gpu_compile_flags}",
+                value=f"-std=c++14 -Xcompiler -fPIC -O{optimization_level} -Xcompiler {march_option} {gpu_cflags}",
+            )
+
+            # Target compilation for hardware micro-code capacities
+            gpu_defaults = get_gpu_hardware_defaults()
+            dace.config.Config.set(
+                "compiler",
+                "cuda",
+                "cuda_arch",
+                value=f"{gpu_defaults.compute_capability}",
             )
 
-            cuda_sm = cp.cuda.Device(0).compute_capability if cp else 60
-            dace.config.Config.set("compiler", "cuda", "cuda_arch", value=f"{cuda_sm}")
-            # Block size/thread count is defaulted to an average value for recent
-            # hardware (Pascal and upward). The problem of setting an optimized
-            # block/thread is both hardware and problem dependant. Fine tuners
-            # available in DaCe should be relied on for further tuning of this value.
+            # Default block size for kernels launch
             dace.config.Config.set(
-                "compiler", "cuda", "default_block_size", value="64,8,1"
+                "compiler",
+                "cuda",
+                "default_block_size",
+                value=str(gpu_defaults.block_size)[1:-1],
             )
             # Potentially buggy - deactivate
             dace.config.Config.set(
@@ -412,4 +422,20 @@ def from_dict(cls, data: dict) -> Self:
         config.rank_size = data["rank_size"]
         config.layout = data["layout"]
         config.tile_resolution = data["tile_resolution"]
-        return config
+        # TODO
+        # Computed properties like `self.code_path` and `self.do_compile`
+        # aren't updated.
+        # We also don't `set_distributed_caches()` based on that updated
+        # information.
+        raise NotImplementedError(
+            "Implementation of `DaceConfig.from_dict()` is incomplete."
+        )
+
+    def set_timer(self, comm: Comm | None) -> None:
+        """Install a fresh `PerformanceCollector` as `self.performance_collector`.
+
+        Args:
+            comm: communication subsystem handed to the collector. When `None`,
+                a `LocalComm(0, 6, {})` is used in its place.
+        """
+        # TODO: this absolutely should not live on a Configuration object,
+        #       and even less be set up from outside. Madness, we have lost
+        #       our ways...
+        if comm is None:
+            timer_comm = LocalComm(0, 6, {})
+        else:
+            timer_comm = comm
+
+        self.performance_collector = PerformanceCollector(
+            "InternalOrchestrationTimer", comm=timer_comm
+        )
diff --git a/ndsl/dsl/dace/hardware_config.py b/ndsl/dsl/dace/hardware_config.py
@@ -0,0 +1,119 @@
+import dataclasses
+import os
+import sys
+
+from ndsl import ndsl_log
+from ndsl.optional_imports import cupy as cp
+
+
+# PCI-SIG vendor IDs mapped to the vendor names used in this module.
+# Taken straight out of https://pcisig.com/membership/member-companies
+# The 0x0 entry is not a real vendor: it holds the fallback name "Unknown".
+# NOTE(review): "SIGNAURES" is presumably a typo for "SIGNATURES" - renaming
+#               requires touching every user of this table.
+_VENDOR_PCI_SIGNAURES = {
+    0x10DE: "Nvidia",
+    0x1002: "AMD",
+    0x8086: "Intel",
+    0x0: "Unknown",
+}
+
+# Cached copy of the hardware defaults: stays `None` until
+# `get_gpu_hardware_defaults()` computes them, then is reused by later calls.
+_GPU_HARDWARE_DEFAULTS = None
+
+
+def _get_vendor() -> str:
+    """Resolve the vendor name of GPU device 0 from its PCI vendor ID.
+
+    The PCI bus ID reported by the CUDA runtime locates the device under
+    `/sys/bus/pci/devices`; the `vendor` file found there holds the PCI-SIG
+    vendor ID, which is translated through `_VENDOR_PCI_SIGNAURES`.
+
+    ⚠️ Only works on Linux - kicks back to "Unknown" in other cases
+
+    Returns:
+        A vendor name from `_VENDOR_PCI_SIGNAURES`. The "Unknown" entry is
+        returned when not on Linux, when the sysfs `vendor` file can't be
+        read or parsed, or when the vendor ID isn't in the table.
+    """
+    unknown_vendor = _VENDOR_PCI_SIGNAURES[0x0]
+    if not sys.platform.startswith("linux"):
+        return unknown_vendor
+
+    # sysfs names PCI devices with lower-case hex digits while the CUDA runtime
+    # may report upper-case ones (e.g. "0000:3B:00.0"): normalise before lookup.
+    pci_device_id = cp.cuda.runtime.deviceGetPCIBusId(0).lower()
+    vendor_path = os.path.join("/sys/bus/pci/devices", pci_device_id, "vendor")
+
+    try:
+        with open(vendor_path, "r") as f:
+            # File content looks like "0x10de"; int(..., 16) accepts the prefix.
+            vendor_id = int(f.read().strip(), 16)
+    except (OSError, ValueError):
+        # Missing device entry, unreadable file or malformed content: this is
+        # a best-effort query, so fall back rather than abort configuration.
+        return unknown_vendor
+
+    vendor = _VENDOR_PCI_SIGNAURES.get(vendor_id)
+    if vendor is None:
+        ndsl_log.error(f"Unknown GPU vendor with PCI-SIG ID of {vendor_id:#X}")
+        return unknown_vendor
+    return vendor
+
+
+@dataclasses.dataclass
+class GPUHardwareDefaults:
+    """Compute defaults for common GPUs
+
+    Plain record of the values `get_gpu_hardware_defaults()` derives for the
+    current hardware.
+    """
+
+    # Vendor name, one of the values of `_VENDOR_PCI_SIGNAURES`
+    vendor: str
+    # Default kernel block size; this module always fills it with three integers
+    block_size: list[int] = dataclasses.field(default_factory=list)
+    # Stays at -1 unless set; only the Nvidia code path of this module sets it
+    compute_capability: int = -1  # Nvidia specific
+
+
+def get_gpu_hardware_defaults() -> GPUHardwareDefaults:
+    """Retrieve default values for GPU computation configuration
+
+    The defaults are computed on the first call and cached at module level;
+    every later call returns that same cached object.
+
+    When cupy is missing or reports no usable CUDA runtime, conservative
+    defaults with vendor "Unknown" are returned. Otherwise the vendor of
+    device 0 picks the default block size and, for Nvidia only, the compute
+    capability is recorded as well.
+
+    Returns:
+        The (cached) `GPUHardwareDefaults` of this process.
+    """
+    global _GPU_HARDWARE_DEFAULTS
+    if _GPU_HARDWARE_DEFAULTS is not None:
+        return _GPU_HARDWARE_DEFAULTS  # type: ignore[unreachable]
+
+    # Smaller common denominator of massively parallel hardware
+    fallback_block_size = [8, 1, 1]
+
+    if not cp or not cp.cuda.is_available():
+        ndsl_log.warning("No cupy - defaulting for GPU hardware")
+        _GPU_HARDWARE_DEFAULTS = GPUHardwareDefaults(
+            vendor="Unknown",
+            block_size=fallback_block_size,
+        )
+        return _GPU_HARDWARE_DEFAULTS
+
+    # Who goes there
+    vendor = _get_vendor()
+    if vendor == "Nvidia":
+        # cupy reports the capability as digits without the dot (8.0 -> "80")
+        compute_capability = int(cp.cuda.Device(0).compute_capability)
+        # Default block size based on compute capability. The lower bounds are
+        # inclusive so that the first capability of a generation (80, 60) falls
+        # into the range listed for it below.
+        if compute_capability >= 80:
+            # Covers:
+            #  - Blackwell (100+)
+            #  - Hopper (90-100)
+            #  - Ampere (80-90)
+            block_sizes = [128, 1, 1]
+        elif compute_capability >= 60:
+            # Covers:
+            #  - Volta (70-80)
+            #  - Pascal (60-70)
+            block_sizes = [64, 8, 1]
+        else:
+            # For older hardware - we default to the safe warp-size since
+            # the dawn of GPGPU on Nvidia hardware
+            block_sizes = [32, 1, 1]
+
+        _GPU_HARDWARE_DEFAULTS = GPUHardwareDefaults(
+            vendor=vendor,
+            block_size=block_sizes,
+            compute_capability=compute_capability,
+        )
+    elif vendor == "AMD":
+        # Default RDNA architecture is Wave64
+        # NOTE(review): Wave64 is presumably the GCN/CDNA default rather than
+        #               RDNA - confirm which hardware this targets.
+        _GPU_HARDWARE_DEFAULTS = GPUHardwareDefaults(
+            vendor=vendor,
+            block_size=[64, 1, 1],
+        )
+    elif vendor == "Intel":
+        # Intel can run 8, 16 or 32 - but SIMD does better with 32
+        _GPU_HARDWARE_DEFAULTS = GPUHardwareDefaults(
+            vendor=vendor,
+            block_size=[32, 1, 1],
+        )
+    else:
+        _GPU_HARDWARE_DEFAULTS = GPUHardwareDefaults(
+            vendor=vendor,
+            block_size=fallback_block_size,
+        )
+
+    ndsl_log.info(f"GPU vendor detected: {_GPU_HARDWARE_DEFAULTS.vendor}")
+
+    return _GPU_HARDWARE_DEFAULTS
+1 −1		.github/workflows/general-ci.yml
+1 −1		.github/workflows/ml-ci.yml
+4 −1		dace/codegen/compiled_sdfg.py
+36 −32		dace/codegen/control_flow.py
+1 −1		dace/codegen/instrumentation/papi.py
+5 −2		dace/codegen/targets/cuda.py
+11 −10		dace/codegen/targets/framecode.py
+1 −1		dace/config.py
+4 −1		dace/data/ctypes_interop.py
+2 −2		dace/dtypes.py
+4 −1		dace/frontend/python/newast.py
+1 −0		dace/frontend/python/replacements/array_creation.py
+1 −0		dace/frontend/python/replacements/array_manipulation.py
+1 −0		dace/frontend/python/replacements/array_metadata.py
+1 −0		dace/frontend/python/replacements/linalg.py
+3 −2		dace/frontend/python/replacements/misc.py
+1 −0		dace/frontend/python/replacements/reduction.py
+1 −0		dace/frontend/python/replacements/ufunc.py
+1 −1		dace/memlet.py
+114 −4		dace/runtime/include/dace/math.h
+114 −47		dace/sdfg/analysis/schedule_tree/tree_to_sdfg.py
+2 −2		dace/sdfg/analysis/schedule_tree/treenodes.py
+1 −1		dace/sdfg/analysis/vector_inference.py
+1 −0		dace/sdfg/infer_types.py
+10 −3		dace/sdfg/nodes.py
+296 −97		dace/sdfg/propagation.py
+94 −14		dace/sdfg/sdfg.py
+203 −18		dace/sdfg/state.py
+3 −0		dace/sdfg/utils.py
+35 −35		dace/subsets.py
+1 −0		dace/transformation/dataflow/add_threadblock_map.py
+69 −22		dace/transformation/dataflow/map_fission.py
+38 −23		dace/transformation/dataflow/redundant_array.py
+1 −1		dace/transformation/dataflow/sve/infer_types.py
+155 −47		dace/transformation/helpers.py
+12 −1		dace/transformation/interstate/state_fusion_with_happens_before.py
+7 −5		dace/transformation/passes/analysis/analysis.py
+511 −0		dace/transformation/passes/loop_to_reduce.py
+8 −15		dace/transformation/passes/reference_reduction.py
+32 −1		doc/conf.py
+218 −0		doc/extensions/backend.rst
+9 −8		doc/extensions/extensions.rst
+129 −0		doc/extensions/frontend.rst
+113 −0		doc/extensions/instrumentation.rst
+393 −0		doc/extensions/libraries.rst
+111 −0		doc/extensions/sdfgconvertible.rst
+133 −0		doc/extensions/symbolic.rst
+1 −1		doc/frontend/daceprograms.rst
+4 −6		doc/frontend/parsing.rst
+129 −0		doc/frontend/preprocessing.rst
+56 −14		doc/frontend/pysupport.rst
+1 −3		doc/frontend/python.rst
+51 −0		doc/general/faq.rst
+1 −1		doc/general/glossary.rst
+1 −1		doc/general/structure.rst
+8 −2		doc/index.rst
+84 −0		doc/optimization/guidelines.rst
+112 −0		doc/optimization/interactive.rst
+3 −5		doc/optimization/optimization.rst
+1 −4		doc/sdfg/ir.rst
+177 −0		doc/sdfg/schedule_tree.rst
+1 −1		doc/setup/integration.rst
+1 −1		doc/setup/quickstart.rst
+2 −2		doc/source/dace.cli.rst
+1 −1		doc/source/dace.codegen.instrumentation.rst
+0 −9		doc/source/dace.rst
+1 −1		tests/codegen/allocation_lifetime_test.py
+2 −1		tests/codegen/control_flow_generation_test.py
+27 −0		tests/codegen/gpu_min_warps_per_eu_test.py
+2 −1		tests/graph_test.py
+134 −0		tests/memlet_propagation_squeezing_test.py
+1 −1		tests/numpy/common.py
+406 −0		tests/passes/loop_to_reduce_test.py
+26 −21		tests/passes/writeset_underapproximation_test.py
+2 −2		tests/schedule_tree/naming_test.py
+3 −4		tests/schedule_tree/schedule_test.py
+142 −1		tests/schedule_tree/to_sdfg_test.py
+17 −10		tests/sdfg/data/container_array_test.py
+50 −37		tests/sdfg/data/structure_test.py
+29 −4		tests/sdfg/reference_test.py
+1 −1		tests/state_transition_test.py
+105 −0		tests/transformations/helpers_test.py
+6 −6		tests/transformations/loop_to_map_test.py
+147 −1		tests/transformations/map_fission_test.py
+1 −1		tutorials/getting_started.ipynb
+22 −0		CHANGELOG.md
+2 −4		README.md
+0 −28		ci/cscs-ci-ext-config.yml
+2 −3		ci/cscs-ci.yml
+3 −3		pyproject.toml
+1 −1		src/gt4py/__about__.py
+1 −1		src/gt4py/_core/filecache.py
+33 −7		src/gt4py/cartesian/backend/dace_backend.py
+14 −14		src/gt4py/cartesian/gtc/dace/oir_to_tasklet.py
+1 −3		src/gt4py/cartesian/gtc/dace/oir_to_treeir.py
+2 −2		src/gt4py/cartesian/utils/compiler.py
+12 −8		src/gt4py/next/otf/compilation/cache.py
+24 −0		src/gt4py/next/program_processors/runners/dace/library_nodes/__init__.py
+175 −0		src/gt4py/next/program_processors/runners/dace/library_nodes/reduce_with_skip_values.py
+62 −152		src/gt4py/next/program_processors/runners/dace/lowering/gtir_dataflow.py
+56 −26		src/gt4py/next/program_processors/runners/dace/lowering/gtir_to_sdfg.py
+4 −1		src/gt4py/next/program_processors/runners/dace/lowering/gtir_to_sdfg_scan.py
+15 −3		src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py
+2 −2		src/gt4py/next/program_processors/runners/dace/transformations/inline_fuser.py
+6 −2		src/gt4py/next/program_processors/runners/dace/transformations/loop_blocking.py
+5 −2		src/gt4py/next/program_processors/runners/dace/transformations/utils.py
+7 −1		src/gt4py/next/program_processors/runners/dace/workflow/factory.py
+4 −2		src/gt4py/next/program_processors/runners/gtfn.py
+9 −0		src/gt4py/storage/cartesian/layout_registry.py
+6 −4		tests/cartesian_tests/integration_tests/feature_tests/test_field_layouts.py
+29 −10		tests/cartesian_tests/integration_tests/multi_feature_tests/test_code_generation.py
+1 −4		tests/cartesian_tests/integration_tests/multi_feature_tests/test_suites.py
+19 −0		tests/cartesian_tests/unit_tests/test_gtc/dace/test_oir_to_tasklet.py
+1 −1		tests/next_tests/unit_tests/otf_tests/compilation_tests/build_systems_tests/test_compiledb.py
+13 −13		uv.lock