spcl · ThrudPrimrose · Apr 25, 2026 · Apr 25, 2026 · Apr 25, 2026 · Apr 25, 2026
diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml
@@ -1,13 +1,13 @@
 name: Pauli GPU Tests
 
 on:
-  workflow_dispatch
-  #push:
-  #  branches: [ main, ci-fix ]
-  #pull_request:
-  #  branches: [ main, ci-fix ]
-  #merge_group:
-  #  branches: [ main, ci-fix ]
+  workflow_dispatch:
+  push:
+    branches: [ main, ci-fix ]
+  pull_request:
+    branches: [ main, ci-fix ]
+  merge_group:
+    branches: [ main, ci-fix ]
 
 env:
   CUDACXX: /usr/local/cuda/bin/nvcc

diff --git a/.github/workflows/gpu-experimental-ci.yml b/.github/workflows/gpu-experimental-ci.yml
@@ -0,0 +1,80 @@
+name: Pauli GPU Tests (ExperimentalCUDACodeGen)
+
+on:  # manual dispatch, plus push / pull_request / merge_group events for main and ci-fix
+  workflow_dispatch:
+  push:
+    branches: [ main, ci-fix ]
+  pull_request:
+    branches: [ main, ci-fix ]
+  merge_group:
+    branches: [ main, ci-fix ]
+
+env:  # workflow-level environment, shared by the steps below
+  CUDACXX: /usr/local/cuda/bin/nvcc
+  MKLROOT: /opt/intel/oneapi/mkl/latest/
+  CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}  # taken from the repository secrets
+  # Force the experimental CUDA codegen for every test in this workflow.
+  DACE_compiler_cuda_implementation: experimental
+
+concurrency:  # one group per workflow and ref; a newer run cancels the run in progress
+  group: ${{github.workflow}}-${{github.ref}}
+  cancel-in-progress: true
+
+jobs:
+  test-gpu-experimental:
+    if: "!contains(github.event.pull_request.labels.*.name, 'no-ci')"  # skipped for pull requests labelled 'no-ci'
+    runs-on: [self-hosted, gpu]  # needs a self-hosted runner carrying the 'gpu' label
+    steps:
+    - uses: actions/checkout@v6
+      with:
+        submodules: 'recursive'
+    - name: Install dependencies  # clears DaCe config/caches, builds a venv, installs DaCe editable, fetches the codecov uploader
+      run: |
+        rm -f ~/.dace.conf
+        rm -rf .dacecache tests/.dacecache
+        python -m venv ~/.venv      # create venv so we can use pip
+        source ~/.venv/bin/activate # activate venv
+        python -m pip install --upgrade pip
+        pip install flake8 pytest-xdist coverage
+        pip install mpi4py
+        pip install cupy
+        pip uninstall -y dace
+        pip install -e ".[testing,ml]"
+        curl -Os https://uploader.codecov.io/latest/linux/codecov
+        chmod +x codecov
+
+    - name: Test dependencies  # runs nvidia-smi inside the venv
+      run: |
+        source ~/.venv/bin/activate # activate venv
+        nvidia-smi
+
+    - name: Run pytest GPU (experimental codegen)  # pytest selection: tests marked "gpu", with XML coverage for dace
+      run: |
+        source ~/.venv/bin/activate # activate venv
+        export DACE_cache=single
+        export PATH=$PATH:/usr/local/cuda/bin  # some test is calling cuobjdump, so it needs to be in path
+        echo "CUDACXX: $CUDACXX"
+        echo "DACE_compiler_cuda_implementation: $DACE_compiler_cuda_implementation"
+        pytest --cov-report=xml --cov=dace --tb=short --timeout_method thread --timeout=300 -m "gpu"
+
+    - name: Run extra GPU tests (experimental codegen)  # tests/cuda_test.sh with PYTHON_BINARY set to a `coverage run` command
+      run: |
+        source ~/.venv/bin/activate # activate venv
+        export NOSTATUSBAR=1
+        export DACE_cache=single
+        export COVERAGE_RCFILE=`pwd`/.coveragerc
+        export PYTHON_BINARY="coverage run --source=dace --parallel-mode"
+        ./tests/cuda_test.sh
+
+    - name: Report overall coverage  # combines coverage data; uploads only when codecov.io answers a ping
+      run: |
+        source ~/.venv/bin/activate # activate venv
+        export COVERAGE_RCFILE=`pwd`/.coveragerc
+        coverage combine . */; coverage report; coverage xml
+        reachable=0
+        ping -W 2 -c 1 codecov.io || reachable=$?
+        if [ $reachable -eq 0 ]; then
+          ./codecov
+        else
+          echo "Codecov.io is unreachable"
+        fi
diff --git a/dace/codegen/CMakeLists.txt b/dace/codegen/CMakeLists.txt
@@ -35,7 +35,8 @@ foreach(DACE_FILE ${DACE_FILES})
   # Make the path absolute
   set(DACE_FILE ${DACE_SRC_DIR}/${DACE_FILE})
   # Now treat the file according to the deduced target
-  if(${DACE_FILE_TARGET} STREQUAL "cuda")
+  # "experimental_cuda" files take the same CUDA/HIP branch as "cuda" files (previously only "cuda" matched).
+  if(${DACE_FILE_TARGET} STREQUAL "experimental_cuda" OR ${DACE_FILE_TARGET} STREQUAL "cuda")
     if(${DACE_FILE_TARGET_TYPE} MATCHES "hip")
       set(DACE_ENABLE_HIP ON)
       set(DACE_HIP_FILES ${DACE_HIP_FILES} ${DACE_FILE})
@@ -261,13 +262,22 @@ endforeach()
 # Create DaCe library file
 add_library(${DACE_PROGRAM_NAME} SHARED ${DACE_CPP_FILES} ${DACE_OBJECTS})
 target_link_libraries(${DACE_PROGRAM_NAME} PUBLIC ${DACE_LIBS})
+# The OpenMP INTERFACE options don't always propagate through to this target, so
+# the flags are put first on the compile and link lines to keep --as-needed from
+# dropping libgomp. SHELL: splits the space-separated OpenMP_CXX_FLAGS string.
+target_compile_options(${DACE_PROGRAM_NAME} BEFORE PRIVATE "SHELL:${OpenMP_CXX_FLAGS}")
+target_link_options(${DACE_PROGRAM_NAME} BEFORE PRIVATE "SHELL:${OpenMP_CXX_FLAGS}")
 
 # Set C++ standard to C++20 (or the configured standard)
 set_property(TARGET ${DACE_PROGRAM_NAME} PROPERTY CXX_STANDARD ${DACE_CPP_STANDARD})
 
 # Create DaCe loader stub
 add_library(dacestub_${DACE_PROGRAM_NAME} SHARED "${CMAKE_SOURCE_DIR}/tools/dacestub.cpp")
 target_link_libraries(dacestub_${DACE_PROGRAM_NAME} Threads::Threads OpenMP::OpenMP_CXX ${CMAKE_DL_LIBS})
+# dacestub.cpp calls omp_get_max_threads() at load time, so the symbol must stay resolved under
+# --as-needed; SHELL: splits the space-separated OpenMP_CXX_FLAGS string into separate options.
+target_compile_options(dacestub_${DACE_PROGRAM_NAME} BEFORE PRIVATE "SHELL:${OpenMP_CXX_FLAGS}")
+target_link_options(dacestub_${DACE_PROGRAM_NAME} BEFORE PRIVATE "SHELL:${OpenMP_CXX_FLAGS}")
 
 # Windows-specific fixes
 if (MSVC_IDE)

diff --git a/dace/codegen/dispatcher.py b/dace/codegen/dispatcher.py
@@ -27,6 +27,7 @@ class DefinedType(attr_enum.ExtensibleAttributeEnum):
     Object = auto()  # An object moved by reference
     Stream = auto()  # A stream object moved by reference and accessed via a push/pop API
     StreamArray = auto()  # An array of Streams
+    GPUStream = auto()  # A backend GPU stream handle (e.g., cudaStream_t / hipStream_t)
 
 
 class DefinedMemlets:
@@ -91,7 +92,8 @@ def add(self, name: str, dtype: DefinedType, ctype: str, ancestor: int = 0, allo
         for _, scope, can_access_parent in reversed(self._scopes):
             if name in scope:
                 err_str = "Shadowing variable {} from type {} to {}".format(name, scope[name], dtype)
-                if (allow_shadowing or config.Config.get_bool("compiler", "allow_shadowing")):
+                if (allow_shadowing or config.Config.get_bool("compiler", "allow_shadowing")
+                        or dtype == DefinedType.GPUStream):
                     if not allow_shadowing:
                         print("WARNING: " + err_str)
                 else:

diff --git a/dace/codegen/instrumentation/gpu_events.py b/dace/codegen/instrumentation/gpu_events.py
@@ -129,7 +129,7 @@ def on_scope_entry(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, n
                                 'GPU_Device map scopes')
 
             idstr = 'b' + self._idstr(cfg, state, node)
-            stream = getattr(node, '_cuda_stream', -1)
+            stream = self._get_gpu_stream(state, node)
             outer_stream.write(self._record_event(idstr, stream), cfg, state_id, node)
 
     def on_scope_exit(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, node: nodes.ExitNode,
@@ -139,7 +139,7 @@ def on_scope_exit(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, no
         s = self._get_sobj(node)
         if s.instrument == dtypes.InstrumentationType.GPU_Events:
             idstr = 'e' + self._idstr(cfg, state, entry_node)
-            stream = getattr(node, '_cuda_stream', -1)
+            stream = self._get_gpu_stream(state, node)
             outer_stream.write(self._record_event(idstr, stream), cfg, state_id, node)
             outer_stream.write(self._report('%s %s' % (type(s).__name__, s.label), cfg, state, entry_node), cfg,
                                state_id, node)
@@ -153,7 +153,7 @@ def on_node_begin(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, no
         if node.instrument == dtypes.InstrumentationType.GPU_Events:
             state_id = state.parent_graph.node_id(state)
             idstr = 'b' + self._idstr(cfg, state, node)
-            stream = getattr(node, '_cuda_stream', -1)
+            stream = self._get_gpu_stream(state, node)
             outer_stream.write(self._record_event(idstr, stream), cfg, state_id, node)
 
     def on_node_end(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, node: nodes.Node,
@@ -165,7 +165,63 @@ def on_node_end(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, node
         if node.instrument == dtypes.InstrumentationType.GPU_Events:
             state_id = state.parent_graph.node_id(state)
             idstr = 'e' + self._idstr(cfg, state, node)
-            stream = getattr(node, '_cuda_stream', -1)
+            stream = self._get_gpu_stream(state, node)
             outer_stream.write(self._record_event(idstr, stream), cfg, state_id, node)
             outer_stream.write(self._report('%s %s' % (type(node).__name__, node.label), cfg, state, node), cfg,
                                state_id, node)
+
+    def _get_gpu_stream(self, state: SDFGState, node: nodes.Node) -> int:
+        """
+        Look up the ID of the GPU stream that ``node`` is associated with.
+
+        With the ``legacy`` CUDA implementation the ID is read from the node's
+        private ``_cuda_stream`` attribute. Otherwise it is taken from the subset
+        of a non-empty memlet that connects ``node`` to an AccessNode whose
+        descriptor has dtype ``gpuStream_t``; the experimental code generator
+        attaches such edges to tasklets and GPU_Device-scheduled maps (kernels),
+        so other node types may have no stream assignment to find.
+
+        Parameters
+        ----------
+        state : SDFGState
+            The state that contains ``node``.
+        node : dace.sdfg.nodes.Node
+            The node whose GPU stream is queried.
+
+        Returns
+        -------
+        int
+            The stream ID, or ``-1`` when none could be determined.
+        """
+        if config.Config.get('compiler', 'cuda', 'implementation') == 'legacy':
+            return getattr(node, '_cuda_stream', -1)
+
+        def _first_stream_id(edges, endpoint_of) -> int:
+            # ID carried by the first non-empty memlet whose far end is a GPU stream AccessNode.
+            for edge in edges:
+                endpoint = endpoint_of(edge)
+                is_stream = (isinstance(endpoint, nodes.AccessNode)
+                             and endpoint.desc(state).dtype == dtypes.gpuStream_t)
+                if is_stream and not edge.data.is_empty():
+                    return int(edge.data.subset)
+            return -1
+
+        # Preferred source: an incoming ``gpu_streams[i]`` edge on the node itself.
+        found = _first_stream_id(state.in_edges(node), lambda e: e.src)
+        if found != -1:
+            return found
+
+        # A MapExit's edge to gpu_streams carries an empty dependency memlet
+        # (see ConnectGPUStreamsToNodes._build_chain), so ask the matching
+        # MapEntry, which has the real ``gpu_streams[i]`` in-edge.
+        if isinstance(node, nodes.MapExit):
+            map_entry = state.entry_node(node)
+            if map_entry is not None:
+                found = _first_stream_id(state.in_edges(map_entry), lambda e: e.src)
+
+        # Last resort for non-exit nodes only: scan the outgoing edges. Exit nodes
+        # are excluded because their stream out-edges are always empty by construction.
+        if found == -1 and not isinstance(node, nodes.ExitNode):
+            found = _first_stream_id(state.out_edges(node), lambda e: e.dst)
+
+        return found
diff --git a/dace/codegen/instrumentation/gpu_tx_markers.py b/dace/codegen/instrumentation/gpu_tx_markers.py
@@ -22,15 +22,18 @@ class GPUTXMarkersProvider(InstrumentationProvider):
 
     def __init__(self):
         self.backend = common.get_gpu_backend()
-        # Check if ROCm TX libraries and headers are available
+        # Check if ROCm TX libraries and headers are available. Only meaningful
+        # when the backend is HIP — on a CUDA host that happens to also have
+        # ROCm installed we must not flip into rocTX mode (would suppress
+        # NVTX init markers via the `enable_rocTX` short-circuits below).
         rocm_path = os.getenv('ROCM_PATH', '/opt/rocm')
         roctx_header_paths = [
             os.path.join(rocm_path, 'roctracer/include/roctx.h'),
             os.path.join(rocm_path, 'include/roctracer/roctx.h')
         ]
         roctx_library_path = os.path.join(rocm_path, 'lib', 'libroctx64.so')
-        self.enable_rocTX = any(os.path.isfile(path)
-                                for path in roctx_header_paths) and os.path.isfile(roctx_library_path)
+        self.enable_rocTX = (self.backend == 'hip' and any(os.path.isfile(path) for path in roctx_header_paths)
+                             and os.path.isfile(roctx_library_path))
         self.include_generated = False
         super().__init__()
 
@@ -171,6 +174,34 @@ def on_scope_exit(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, no
             return
         self.print_range_pop(outer_stream)
 
+    def on_node_begin(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, node: nodes.Node,
+                      outer_stream: CodeIOStream, inner_stream: CodeIOStream, global_stream: CodeIOStream) -> None:
+        """
+        Opens a TX range named after ``node.label`` around host-side ``copy_*`` tasklets.
+
+        The legacy ``_emit_copy()`` path fires ``on_copy_begin``; tasklets emitted by
+        expanded CopyLibraryNode instances bypass it, so they are bracketed here instead.
+        """
+        wants_markers = state.instrument == dtypes.InstrumentationType.GPU_TX_MARKERS
+        if not (wants_markers and isinstance(node, nodes.Tasklet)):
+            return
+        # Tasklets inside a device-level GPU kernel and tasklets not labelled ``copy_*`` get no range.
+        if is_devicelevel_gpu_kernel(sdfg, state, node) or not node.label.startswith('copy_'):
+            return
+        self.print_range_push(node.label, sdfg, outer_stream)
+
+    def on_node_end(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, node: nodes.Node,
+                    outer_stream: CodeIOStream, inner_stream: CodeIOStream, global_stream: CodeIOStream) -> None:
+        """
+        Pops the TX range of a ``copy_*`` tasklet, applying the same conditions as ``on_node_begin``.
+        """
+        wants_markers = state.instrument == dtypes.InstrumentationType.GPU_TX_MARKERS
+        if not (wants_markers and isinstance(node, nodes.Tasklet)):
+            return
+        if is_devicelevel_gpu_kernel(sdfg, state, node) or not node.label.startswith('copy_'):
+            return
+        self.print_range_pop(outer_stream)
+
     def on_sdfg_init_begin(self, sdfg: SDFG, callsite_stream: CodeIOStream, global_stream: CodeIOStream) -> None:
         if sdfg.instrument != dtypes.InstrumentationType.GPU_TX_MARKERS:
             return

diff --git a/dace/codegen/targets/__init__.py b/dace/codegen/targets/__init__.py
@@ -5,3 +5,4 @@
 from .mlir.mlir import MLIRCodeGen
 from .sve.codegen import SVECodeGen
 from .snitch import SnitchCodeGen
+from .experimental_cuda import ExperimentalCUDACodeGen