GridTools · tehrengruber · Apr 9, 2026 · Apr 10, 2026 · Apr 10, 2026 · Apr 15, 2026
diff --git a/docs/development/dace_codegen_reproducability.md b/docs/development/dace_codegen_reproducability.md
@@ -0,0 +1,95 @@
+# Debugging indeterministic behavior of dace transformations
+
+- Enable printing each transformation step, e.g. using
+  ```
+  dace.Config.set("progress", value=True)
+  ```
+  TODO: Introduce a new config variable that prints the hash instead of hard-coding it.
+- Execute the program in question twice and compare the output.
+- Set a conditional breakpoint at the beginning of the `apply` method of the first pass where the
+  SDFG hash changes, using the condition `sdfg.hash_sdfg() == <last equal hash>`.
+  Note: If running the previous passes takes a long time, it makes sense to serialize the SDFG
+  to json (`sdfg.to_json("sdfg(1|2).json")`) and load it again (see the debug scripts below) to
+  ease debugging. In rare cases serializing and deserializing the SDFG changes the hash. In such
+  cases this trick doesn't work and the first location where the hash changes might not be the exact
+  location where the indeterministic behavior is. It helps to use a different hash, e.g.
+  `content_hash`, but this should be solved in general.
+  Note: It makes sense to also place a breakpoint after `DaceTranslator.generate_sdfg` to recognize
+  when all executions have finished.
+- Once the location is found, it is usually easy to spot the origin of the indeterminism. Often
+  there is a set operation, or a symbol is named in an indeterministic way. Use ordered sets and
+  deterministic symbol names.
+
+## Appendix
+
+__Debugging sdfg autooptimize__
+
+Usage: `python debug_auto_optimize_sdfg.py sdfg1.json`
+
+```python
+import pickle
+import sys
+
+import dace
+import json
+
+from dace import SDFG
+
+from gt4py.next.program_processors.runners.dace import (
+    lowering as gtx_dace_lowering,
+    sdfg_args as gtx_dace_args,
+    transformations as gtx_transformations,
+)
+from dace.utils import print_sdfg_hash
+
+file = sys.argv[1]
+
+with open(file) as f:
+    data = json.load(f)
+    sdfg = dace.SDFG.from_json(data)
+    print_sdfg_hash(sdfg)
+
+    gtx_transformations.gt_auto_optimize(
+        sdfg,
+        gpu=False,
+        constant_symbols={},
+        unit_strides_kind=None,
+    )
+```
+
+__Debugging single sdfg transform__
+
+Usage: `python debug_single_sdfg_transform.py sdfg1.json`
+
+```python
+import pickle
+import sys
+
+import dace
+import json
+
+from dace import SDFG
+
+from gt4py.next.program_processors.runners.dace import (
+    lowering as gtx_dace_lowering,
+    sdfg_args as gtx_dace_args,
+    transformations as gtx_transformations,
+)
+from dace.utils import print_sdfg_hash
+
+transformation = gtx_transformations.MoveDataflowIntoIfBody
+file = sys.argv[1]
+
+with open(file) as f:
+    data = json.load(f)
+    sdfg = dace.SDFG.from_json(data)
+    print_sdfg_hash(sdfg)
+
+    sdfg.apply_transformations_repeated(
+        transformation(
+            ignore_upstream_blocks=False,
+        ),
+        validate=False,
+        validate_all=True,
+    )
+```
diff --git a/pyproject.toml b/pyproject.toml
@@ -120,7 +120,8 @@ dependencies = [
   'toolz>=0.12.1',
   'typing-extensions>=4.12.0',
   'versioningit>=3.1.1',
-  'xxhash>=3.5.0'
+  'xxhash>=3.5.0',
+  'ordered-set>=4.0.0'
 ]
 description = 'Python library for generating high-performance implementations of stencil kernels for weather and climate modeling from a domain-specific language (DSL)'
 dynamic = ['version']

diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/map_fusion_utils.py b/src/gt4py/next/program_processors/runners/dace/transformations/map_fusion_utils.py
@@ -15,6 +15,7 @@
 import dace
 from dace import subsets as dace_subsets
 from dace.sdfg import nodes as dace_nodes
+from ordered_set import OrderedSet
 
 from gt4py.next.program_processors.runners.dace.transformations import (
     splitting_tools as gtx_dace_split,
@@ -78,8 +79,11 @@ def _new_name(old_name: str) -> str:
         elif isinstance(node, dace_nodes.NestedSDFG):
             node_ = graph.add_nested_sdfg(
                 sdfg=copy.deepcopy(node.sdfg),
-                inputs=set(node.in_connectors.keys()),
-                outputs=set(node.out_connectors.keys()),
+                # TODO(tehrengruber): What is the performance optimization from Philip about?
+                #  In any case this here leads to an sdfg in which the order in graph.nodes
+                #  is indeterministic, but to_json, then from_json restores it again.
+                inputs={k: None for k in node.in_connectors.keys()},
+                outputs={k: None for k in node.out_connectors.keys()},
-                inputs={k: None for k in node.in_connectors.keys()},
-                outputs={k: None for k in node.out_connectors.keys()},
+                inputs=node.in_connectors.copy().
+                outputs=node.out_connectors.copy(),
-                inputs={k: None for k in node.in_connectors.keys()},
-                outputs={k: None for k in node.out_connectors.keys()},
+                inputs=node.in_connectors.copy().
+                outputs=node.out_connectors.copy(),
                 symbol_mapping=node.symbol_mapping.copy(),
                 debuginfo=copy.copy(node.debuginfo),
             )
@@ -202,8 +206,10 @@ def split_overlapping_map_range(
         Two lists, each containing the ranges corresponding to the splitted range
         for the first and the second map, respectively.
     """
-    first_map_params = set(first_map.params)
-    second_map_params = set(second_map.params)
+    # TODO(tehrengruber): The structure here looks a little funky. We just use an ordered set for
+    #  now, but likely no sets are needed at all.
+    first_map_params = OrderedSet(first_map.params)
+    second_map_params = OrderedSet(second_map.params)
     if first_map_params != second_map_params:
         return None
 

diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/move_dataflow_into_if_body.py b/src/gt4py/next/program_processors/runners/dace/transformations/move_dataflow_into_if_body.py
@@ -24,10 +24,10 @@
     type_inference as dace_type_inference,
     utils as dace_sutils,
 )
+from ordered_set import OrderedSet
 
 from gt4py.next.program_processors.runners.dace import transformations as gtx_transformations
 
-
 @dace_properties.make_properties
 class MoveDataflowIntoIfBody(dace_transformation.SingleStateTransformation):
     """The transformation moves dataflow into the if branches.
@@ -494,8 +494,8 @@ def _remove_outside_dataflow(
 
         The function will also remove data containers that are no longer in use.
         """
-        all_relocatable_dataflow: set[dace_nodes.Node] = functools.reduce(
-            lambda s1, s2: s1.union(s2), relocatable_dataflow.values(), set()
+        all_relocatable_dataflow: OrderedSet[dace_nodes.Node] = functools.reduce(
+            lambda s1, s2: s1.union(s2), relocatable_dataflow.values(), OrderedSet()
         )
 
         # Before we can clean the original nodes, we must clean the dataflow. If a
@@ -525,7 +525,7 @@ def _update_symbol_mapping(
         are available in the parent SDFG.
         """
         symbol_mapping = if_block.symbol_mapping
-        missing_symbols = [ms for ms in if_block.sdfg.free_symbols if ms not in symbol_mapping]
+        missing_symbols = sorted([ms for ms in if_block.sdfg.free_symbols if ms not in symbol_mapping])
         symbol_mapping.update({s: s for s in missing_symbols})
         if_block.symbol_mapping = symbol_mapping  # Performs conversion.
 
@@ -706,10 +706,10 @@ def _filter_relocatable_dataflow(
         sdfg: dace.SDFG,
         state: dace.SDFGState,
         if_block: dace_nodes.NestedSDFG,
-        raw_relocatable_dataflow: dict[str, set[dace_nodes.Node]],
-        non_relocatable_dataflow: dict[str, set[dace_nodes.Node]],
+        raw_relocatable_dataflow: dict[str, OrderedSet[dace_nodes.Node]],
+        non_relocatable_dataflow: dict[str, OrderedSet[dace_nodes.Node]],
         enclosing_map: dace_nodes.MapEntry,
-    ) -> dict[str, set[dace_nodes.Node]]:
+    ) -> dict[str, OrderedSet[dace_nodes.Node]]:
         """Partition the dependencies.
 
         The function expects the dataflow that is upstream of every connector
@@ -734,23 +734,23 @@ def _filter_relocatable_dataflow(
         """
 
         # Remove the parts of the dataflow that is unrelocatable.
-        all_non_relocatable_dataflow: set[dace_nodes.Node] = functools.reduce(
-            lambda s1, s2: s1.union(s2), non_relocatable_dataflow.values(), set()
+        all_non_relocatable_dataflow: OrderedSet[dace_nodes.Node] = functools.reduce(
+            lambda s1, s2: s1.union(s2), non_relocatable_dataflow.values(), OrderedSet()
         )
         relocatable_dataflow = {
             conn_name: rel_df.difference(all_non_relocatable_dataflow)
             for conn_name, rel_df in raw_relocatable_dataflow.items()
         }
 
         # Find the known_nodes for each branch
-        known_nodes: dict[dace.SDFGState, set[dace_nodes.Node]] = dict()
+        known_nodes: dict[dace.SDFGState, OrderedSet[dace_nodes.Node]] = dict()
         for conn_name, rel_df in relocatable_dataflow.items():
             branch_state, _ = self._find_branch_for(if_block=if_block, connector=conn_name)
             if branch_state not in known_nodes:
-                known_nodes[branch_state] = set()
+                known_nodes[branch_state] = OrderedSet()
             known_nodes[branch_state].update(rel_df)
 
-        multiple_df_nodes: set[dace_nodes.Node] = set()
+        multiple_df_nodes: OrderedSet[dace_nodes.Node] = OrderedSet()
         # Find intersect of all known_nodes sets which are the nodes that are in the dataflow
         # of multiple branches and thus doesn't make sense to relocate
         for branch_state, known_nodes_set in known_nodes.items():
@@ -770,8 +770,8 @@ def _filter_relocatable_dataflow(
         #   the data is single use data, is not an AccessNode that refers to global
         #   memory nor is a source AccessNode.
         def filter_nodes(
-            nodes_proposed_for_reloc: set[dace_nodes.Node],
-        ) -> set[dace_nodes.Node]:
+            nodes_proposed_for_reloc: OrderedSet[dace_nodes.Node],
+        ) -> OrderedSet[dace_nodes.Node]:
             has_been_updated = True
             while has_been_updated:
                 has_been_updated = False
@@ -793,7 +793,7 @@ def filter_nodes(
                         for oedge in state.out_edges(reloc_node)
                         if oedge.dst is not if_block
                     ):
-                        nodes_proposed_for_reloc.remove(reloc_node)
+                        nodes_proposed_for_reloc.remove(reloc_node) # TODO(tehrengruber): this is O(N)
                         has_been_updated = True
                         continue
 
@@ -804,14 +804,14 @@ def filter_nodes(
                     #  the `if` body or be mapped (remain outside but made accessible
                     #  inside), thus their relocation state is of no concern for
                     #  `reloc_node`.
-                    non_mappable_incoming_nodes: set[dace_nodes.Node] = {
+                    non_mappable_incoming_nodes: OrderedSet[dace_nodes.Node] = OrderedSet(
                         iedge.src
                         for iedge in state.in_edges(reloc_node)
                         if not (
                             (iedge.src is enclosing_map)
                             or isinstance(iedge.src, dace_nodes.AccessNode)
                         )
-                    }
+                    )
                     if non_mappable_incoming_nodes.issubset(nodes_proposed_for_reloc):
                         # All nodes that can not be mapped into the `if` body are
                         #  currently scheduled to be relocated, thus there is not
@@ -826,7 +826,7 @@ def filter_nodes(
                         #  that none of its input can. Thus we remove them from
                         #  `nodes_proposed_for_reloc`.
                         nodes_proposed_for_reloc.difference_update(non_mappable_incoming_nodes)
-                        nodes_proposed_for_reloc.remove(reloc_node)
+                        nodes_proposed_for_reloc.remove(reloc_node) # TODO(tehrengruber): this is O(N)
                         has_been_updated = True
 
             return nodes_proposed_for_reloc
@@ -838,7 +838,7 @@ def filter_nodes(
     def _partition_if_block(
         self,
         if_block: dace_nodes.NestedSDFG,
-    ) -> Optional[tuple[set[str], set[str]]]:
+    ) -> Optional[tuple[OrderedSet[str], OrderedSet[str]]]:
         """Check if `if_block` can be processed and partition the input connectors.
 
         The function will check if `if_block` has the right structure, i.e. if it is
@@ -858,7 +858,7 @@ def _partition_if_block(
             return None
 
         # These are all the output names.
-        output_names: set[str] = set(if_block.out_connectors.keys())
+        output_names: OrderedSet[str] = OrderedSet(if_block.out_connectors.keys())
 
         # We require that the nested SDFG contains a single node, which is a
         #  `ConditionalBlock` containing two branches.
@@ -899,14 +899,14 @@ def _partition_if_block(
         #  So the ones that can be relocated were found exactly once. Zero would
         #  mean they can not be relocated and more than one means that we do not
         #  support it yet.
-        relocatable_connectors = {
+        relocatable_connectors = OrderedSet(
             conn_name for conn_name, conn_count in reference_count.items() if conn_count == 1
-        }
-        non_relocatable_connectors = {
+        )
+        non_relocatable_connectors = OrderedSet(
             conn_name
             for conn_name in reference_count.keys()
             if conn_name not in relocatable_connectors
-        }
+        )
 
         if len(non_relocatable_connectors) == 0:
             return None

diff --git a/...ogram_processors/runners/dace/transformations/multi_state_global_self_copy_elimination.py b/...ogram_processors/runners/dace/transformations/multi_state_global_self_copy_elimination.py
@@ -94,10 +94,10 @@ def modifies(self) -> dace_ppl.Modifies:
     def should_reapply(self, modified: dace_ppl.Modifies) -> bool:
         return modified & (dace_ppl.Modifies.Memlets | dace_ppl.Modifies.AccessNodes)
 
-    def depends_on(self) -> set[type[dace_transformation.Pass]]:
-        return {
+    def depends_on(self) -> list[type[dace_transformation.Pass]]:
+        return [
             dace_transformation.passes.FindAccessStates,
-        }
+        ]
 
     def apply_pass(
         self, sdfg: dace.SDFG, pipeline_results: dict[str, Any]

diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/simplify.py b/src/gt4py/next/program_processors/runners/dace/transformations/simplify.py
@@ -551,11 +551,11 @@ def modifies(self) -> dace_ppl.Modifies:
     def should_reapply(self, modified: dace_ppl.Modifies) -> bool:
         return modified & (dace_ppl.Modifies.Memlets | dace_ppl.Modifies.AccessNodes)
 
-    def depends_on(self) -> set[type[dace_transformation.Pass]]:
-        return {
+    def depends_on(self) -> list[type[dace_transformation.Pass]]:
+        return [
             dace_transformation.passes.StateReachability,
             dace_transformation.passes.FindAccessStates,
-        }
+        ]
 
     def apply_pass(
         self, sdfg: dace.SDFG, pipeline_results: dict[str, Any]
@@ -1008,7 +1008,7 @@ def apply(
         # This is the tasklet that we will put inside the map, note we have to do it
         #  this way to avoid some name clash stuff.
         inner_tasklet: dace_nodes.Tasklet = graph.add_tasklet(
-            name=f"{tasklet.label}__clone_{str(uuid.uuid1()).replace('-', '_')}",
+            name=f"{tasklet.label}__clone_{dace_nodes._get_next_node_id()}",  # TODO: use another global counter
             outputs=tasklet.out_connectors.keys(),
             inputs=set(),
             code=tasklet.code,
@@ -1033,7 +1033,7 @@ def apply(
 
         # Now we will reroute the edges went through the inner map, through the
         #  inner access node instead.
-        for old_inner_edge in list(
+        for old_inner_edge in list(  # TODO(tehrengruber): Why all these list comprehensions everywhere?
             graph.out_edges_by_connector(map_entry, "OUT_" + connector_name)
         ):
             # We now modify the downstream data. This is because we no longer refer
@@ -1157,8 +1157,8 @@ def __init__(
     def expressions(cls) -> Any:
         return [dace.sdfg.utils.node_path_graph(cls.map_exit, cls.tmp_ac, cls.glob_ac)]
 
-    def depends_on(self) -> set[type[dace_transformation.Pass]]:
-        return {dace_transformation.passes.ConsolidateEdges}
+    def depends_on(self) -> list[type[dace_transformation.Pass]]:
+        return [dace_transformation.passes.ConsolidateEdges]
 
     def can_be_applied(
         self,