diff --git a/exir/memory_planning.py b/exir/memory_planning.py
index b13568a0f93..9b19cbc7770 100644
--- a/exir/memory_planning.py
+++ b/exir/memory_planning.py
@@ -1207,9 +1207,9 @@ def _handle(
 
 
 def _partition_specs_by_device(
-    all_specs: set[TensorSpec],
+    all_specs: list[TensorSpec],
     enable_non_cpu_memory_planning: bool,
-) -> dict[tuple[DeviceType, int], set[TensorSpec]]:
+) -> dict[tuple[DeviceType, int], list[TensorSpec]]:
     """Partition specs by (device_type, device_index).
 
     Different device indices on the same device type (e.g. CUDA:0 vs CUDA:1)
@@ -1217,8 +1217,11 @@ def _partition_specs_by_device(
 
     When ``enable_non_cpu_memory_planning`` is False (legacy), all specs
     are placed into a single CPU:0 bucket regardless of their device attribute.
+
+    Insertion order is preserved within each partition because order-sensitive
+    algorithms (e.g. greedy with bisect.insort) rely on it for stable tie-breaking.
     """
-    specs_by_device: dict[tuple[DeviceType, int], set[TensorSpec]] = defaultdict(set)
+    specs_by_device: dict[tuple[DeviceType, int], list[TensorSpec]] = defaultdict(list)
     if not enable_non_cpu_memory_planning:
         specs_by_device[_CPU_KEY] = all_specs
         return specs_by_device
@@ -1227,7 +1230,7 @@ def _partition_specs_by_device(
     has_pre_assigned_mem_id = False
     for spec in all_specs:
         device_key = (spec.device, spec.device_index)
-        specs_by_device[device_key].add(spec)
+        specs_by_device[device_key].append(spec)
         if spec.device != DeviceType.CPU:
             has_non_cpu_specs = True
         if spec.mem_id is not None:
@@ -1308,9 +1311,11 @@ def apply_algo(
     # Extract the nodes and their lifespans from the graph_module
     _ = update_all_tensors_lifetime(graph_module, graph_signature)
 
-    # Collect and materialize specs into a set so we can iterate multiple
-    # times and partition by device.
-    all_specs: set[TensorSpec] = set(
+    # Collect specs into an ordered list so we can iterate multiple times and
+    # partition by device. Order matters: order-sensitive algorithms (e.g.
+    # greedy with bisect.insort) rely on insertion order for stable tie-breaking,
+    # and `collect_specs_from_nodes` already deduplicates via its `dedup` flag.
+    all_specs: list[TensorSpec] = list(
         collect_specs_from_nodes(
             graph_module.graph.nodes,
             graph_signature,
diff --git a/exir/print_program.py b/exir/print_program.py
index cf2daa2c2d3..c1ec1a0bb8e 100644
--- a/exir/print_program.py
+++ b/exir/print_program.py
@@ -353,16 +353,21 @@ def _format_graph(graph: torch.fx.Graph, offending_node_idx: int) -> str:
 
 def _stacktrace_to_framelist(stacktrace: str) -> FrameList:
     """Creates a frame list from a stacktrace string."""
-    pattern = r'File "(.*?)", line (\d+), in (.*?)\n'
-    matches = re.findall(pattern, stacktrace)
+    # Capture (filename, lineno, name, source-line) in a single regex. Python
+    # 3.11+ tracebacks may include extra caret/underline lines (e.g. "^^^^")
+    # between frames, so we cannot rely on a fixed line offset; instead we pull
+    # the source line directly out of the line that immediately follows each
+    # `File "...", line N, in ` header.
+    pattern = re.compile(r'File "(.*?)", line (\d+), in (.*?)\n([^\n]*)')
+    matches = pattern.findall(stacktrace)
     mapped_frame_list = [
         Frame(
-            filename=match[0],
-            lineno=int(match[1]),
-            name=match[2],
-            context=stacktrace.split("\n")[i * 2 + 1].strip(),
+            filename=m[0],
+            lineno=int(m[1]),
+            name=m[2],
+            context=m[3].strip(),
         )
-        for i, match in enumerate(matches)
+        for m in matches
     ]
     return FrameList(mapped_frame_list)
 
diff --git a/runtime/core/test/device_allocator_test.cpp b/runtime/core/test/device_allocator_test.cpp
index f0bd7c6556e..3c165fcc0d9 100644
--- a/runtime/core/test/device_allocator_test.cpp
+++ b/runtime/core/test/device_allocator_test.cpp
@@ -231,6 +231,9 @@ TEST_F(DeviceAllocatorTest, RegistrySingletonInstance) {
   EXPECT_EQ(&instance1, &instance2);
 }
 
+// EXPECT_DEATH requires gtest death-test support, which is unavailable on
+// platforms without fork() (e.g. iOS). Skip on those platforms.
+#if GTEST_HAS_DEATH_TEST
 TEST_F(DeviceAllocatorTest, RegisteringSameDeviceTypeTwiceAborts) {
   // The fixture has already registered cuda_allocator() for CUDA; attempting
   // to register a second allocator for the same device type must abort.
@@ -239,3 +242,4 @@ TEST_F(DeviceAllocatorTest, RegisteringSameDeviceTypeTwiceAborts) {
       register_device_allocator(&another_allocator),
       "Allocator already registered");
 }
+#endif // GTEST_HAS_DEATH_TEST