Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
cd90f5b
Fix `ReplaceAxisSymbol` and keep it to Taskslets -> `ReplaceAxisSymbo…
FlorianDeconinck May 14, 2026
2c8f74e
Add `TreeOptimizationStatistics` to capture the results of the opt at…
FlorianDeconinck May 14, 2026
8b49e3b
Add a master `CartesianMerge` bringing everything axis merge, refacto…
FlorianDeconinck May 14, 2026
c8d05af
Move helpers into `common` and break them by type
FlorianDeconinck May 15, 2026
20665a8
Fix imports
FlorianDeconinck May 15, 2026
c8a225e
`InlineVertical2DWrite` + utest
FlorianDeconinck May 15, 2026
73f5609
Fix InlineVertical2DWrite
FlorianDeconinck May 15, 2026
fc1ecb1
cleanup
romanc May 18, 2026
d7e40aa
fix symbol replacement
romanc May 18, 2026
55ad8fa
update gt4py (log10 and friends)
romanc May 18, 2026
1c9bb5f
more cleanup (all minor nothing fancy)
romanc May 18, 2026
376a187
Merge remote-tracking branch 'origin/develop' into opt_cycle_I/loop_m…
romanc May 20, 2026
36204b0
Add support for InlineOffgridConditionals
romanc May 20, 2026
689ab89
fixup: temp fix for test of InlineOffgridConditionals
romanc May 20, 2026
c263116
cleanup: remove old "push if down" codepath
romanc May 20, 2026
7d6ecc1
Normalize cartesian index during data depedancy check
FlorianDeconinck May 20, 2026
de03d34
Update tests
FlorianDeconinck May 20, 2026
ff57227
ReplaceAxisSymbolInTasklet -> ReplaceAxisSymbol + debug of it's usage
FlorianDeconinck May 20, 2026
94b2e99
fix unit test by hardinging detection of "our" loops
romanc May 21, 2026
3a50577
unrelated cleanup: fix/assert type issues
romanc May 21, 2026
d6824f3
Changes to `InlineVertical2DWrite`
romanc May 21, 2026
454fb44
dace update: connect source/sink nodes with empty memlets
romanc May 22, 2026
9ba2664
dace update: support for self-assigning copy nodes
romanc May 22, 2026
f8798a0
GPU tree orchestration pipeline
FlorianDeconinck May 22, 2026
89294d2
Add scalarized array to tree statistics
FlorianDeconinck May 22, 2026
456b5fb
Replace `AxisSymbol` in "masklet as well + rename file
FlorianDeconinck May 22, 2026
0aaa78d
Deactivate `InlineVertical2DWrite` for now
FlorianDeconinck May 22, 2026
634a097
Lint
FlorianDeconinck May 22, 2026
02102af
Fix tests after collapsing maps / fix non-cartesian loop inline
romanc May 26, 2026
43674af
fixes to run GFLD_1M with orch:dace:cpu:KJI backend
romanc May 28, 2026
1ea1314
ci: gt4py update (restore temp dace working branch)
romanc May 29, 2026
fa60dcc
remove extra `f` in result report header
romanc May 29, 2026
78d8487
Merge remote-tracking branch 'origin/develop' into opt_cycle_I/loop_m…
romanc May 29, 2026
160923f
unrelated dace/gt4py update: just test fixes and a typo
romanc Jun 1, 2026
c2bd78d
Expose `gpu:IJK` backends to NDSL
FlorianDeconinck Jun 2, 2026
eaaa0cc
Disable DaceConfig.from_dict() as it is incomplete
romanc Jun 3, 2026
749f8a3
readability of cache location code
romanc Jun 3, 2026
84fc75c
Merge branch 'opt_cycle_I/loop_merge' of github.com:FlorianDeconinck/…
romanc Jun 3, 2026
0d860bf
translate tests: fix crash in reporting when comparing scalars
romanc Jun 4, 2026
5ee3bb9
Weaken the cube-sphere communicator hard ranks limit. We need "at lea…
FlorianDeconinck Jun 5, 2026
da82ede
Adjust `cflags` format read for orchestrated compile
FlorianDeconinck Jun 5, 2026
64dd47c
Lint
FlorianDeconinck Jun 5, 2026
26fb0ef
Introduce hardware configuration good defaults
FlorianDeconinck Jun 6, 2026
7bdd3fa
Fix double load for compiling rank
FlorianDeconinck Jun 6, 2026
0fcd9bd
Hardware default: gives back default when no `cp` instead of raising
FlorianDeconinck Jun 6, 2026
d843a2c
Orch: always collapse maps to maximize the kernel parallel basis
FlorianDeconinck Jun 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion external/dace
Submodule dace updated 85 files
+1 −1 .github/workflows/general-ci.yml
+1 −1 .github/workflows/ml-ci.yml
+4 −1 dace/codegen/compiled_sdfg.py
+36 −32 dace/codegen/control_flow.py
+1 −1 dace/codegen/instrumentation/papi.py
+5 −2 dace/codegen/targets/cuda.py
+11 −10 dace/codegen/targets/framecode.py
+1 −1 dace/config.py
+4 −1 dace/data/ctypes_interop.py
+2 −2 dace/dtypes.py
+4 −1 dace/frontend/python/newast.py
+1 −0 dace/frontend/python/replacements/array_creation.py
+1 −0 dace/frontend/python/replacements/array_manipulation.py
+1 −0 dace/frontend/python/replacements/array_metadata.py
+1 −0 dace/frontend/python/replacements/linalg.py
+3 −2 dace/frontend/python/replacements/misc.py
+1 −0 dace/frontend/python/replacements/reduction.py
+1 −0 dace/frontend/python/replacements/ufunc.py
+1 −1 dace/memlet.py
+114 −4 dace/runtime/include/dace/math.h
+114 −47 dace/sdfg/analysis/schedule_tree/tree_to_sdfg.py
+2 −2 dace/sdfg/analysis/schedule_tree/treenodes.py
+1 −1 dace/sdfg/analysis/vector_inference.py
+1 −0 dace/sdfg/infer_types.py
+10 −3 dace/sdfg/nodes.py
+296 −97 dace/sdfg/propagation.py
+94 −14 dace/sdfg/sdfg.py
+203 −18 dace/sdfg/state.py
+3 −0 dace/sdfg/utils.py
+35 −35 dace/subsets.py
+1 −0 dace/transformation/dataflow/add_threadblock_map.py
+69 −22 dace/transformation/dataflow/map_fission.py
+38 −23 dace/transformation/dataflow/redundant_array.py
+1 −1 dace/transformation/dataflow/sve/infer_types.py
+155 −47 dace/transformation/helpers.py
+12 −1 dace/transformation/interstate/state_fusion_with_happens_before.py
+7 −5 dace/transformation/passes/analysis/analysis.py
+511 −0 dace/transformation/passes/loop_to_reduce.py
+8 −15 dace/transformation/passes/reference_reduction.py
+32 −1 doc/conf.py
+218 −0 doc/extensions/backend.rst
+9 −8 doc/extensions/extensions.rst
+129 −0 doc/extensions/frontend.rst
+113 −0 doc/extensions/instrumentation.rst
+393 −0 doc/extensions/libraries.rst
+111 −0 doc/extensions/sdfgconvertible.rst
+133 −0 doc/extensions/symbolic.rst
+1 −1 doc/frontend/daceprograms.rst
+4 −6 doc/frontend/parsing.rst
+129 −0 doc/frontend/preprocessing.rst
+56 −14 doc/frontend/pysupport.rst
+1 −3 doc/frontend/python.rst
+51 −0 doc/general/faq.rst
+1 −1 doc/general/glossary.rst
+1 −1 doc/general/structure.rst
+8 −2 doc/index.rst
+84 −0 doc/optimization/guidelines.rst
+112 −0 doc/optimization/interactive.rst
+3 −5 doc/optimization/optimization.rst
+1 −4 doc/sdfg/ir.rst
+177 −0 doc/sdfg/schedule_tree.rst
+1 −1 doc/setup/integration.rst
+1 −1 doc/setup/quickstart.rst
+2 −2 doc/source/dace.cli.rst
+1 −1 doc/source/dace.codegen.instrumentation.rst
+0 −9 doc/source/dace.rst
+1 −1 tests/codegen/allocation_lifetime_test.py
+2 −1 tests/codegen/control_flow_generation_test.py
+27 −0 tests/codegen/gpu_min_warps_per_eu_test.py
+2 −1 tests/graph_test.py
+134 −0 tests/memlet_propagation_squeezing_test.py
+1 −1 tests/numpy/common.py
+406 −0 tests/passes/loop_to_reduce_test.py
+26 −21 tests/passes/writeset_underapproximation_test.py
+2 −2 tests/schedule_tree/naming_test.py
+3 −4 tests/schedule_tree/schedule_test.py
+142 −1 tests/schedule_tree/to_sdfg_test.py
+17 −10 tests/sdfg/data/container_array_test.py
+50 −37 tests/sdfg/data/structure_test.py
+29 −4 tests/sdfg/reference_test.py
+1 −1 tests/state_transition_test.py
+105 −0 tests/transformations/helpers_test.py
+6 −6 tests/transformations/loop_to_map_test.py
+147 −1 tests/transformations/map_fission_test.py
+1 −1 tutorials/getting_started.ipynb
2 changes: 1 addition & 1 deletion external/gt4py
Submodule gt4py updated 30 files
+22 −0 CHANGELOG.md
+2 −4 README.md
+0 −28 ci/cscs-ci-ext-config.yml
+2 −3 ci/cscs-ci.yml
+3 −3 pyproject.toml
+1 −1 src/gt4py/__about__.py
+1 −1 src/gt4py/_core/filecache.py
+33 −7 src/gt4py/cartesian/backend/dace_backend.py
+14 −14 src/gt4py/cartesian/gtc/dace/oir_to_tasklet.py
+1 −3 src/gt4py/cartesian/gtc/dace/oir_to_treeir.py
+2 −2 src/gt4py/cartesian/utils/compiler.py
+12 −8 src/gt4py/next/otf/compilation/cache.py
+24 −0 src/gt4py/next/program_processors/runners/dace/library_nodes/__init__.py
+175 −0 src/gt4py/next/program_processors/runners/dace/library_nodes/reduce_with_skip_values.py
+62 −152 src/gt4py/next/program_processors/runners/dace/lowering/gtir_dataflow.py
+56 −26 src/gt4py/next/program_processors/runners/dace/lowering/gtir_to_sdfg.py
+4 −1 src/gt4py/next/program_processors/runners/dace/lowering/gtir_to_sdfg_scan.py
+15 −3 src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py
+2 −2 src/gt4py/next/program_processors/runners/dace/transformations/inline_fuser.py
+6 −2 src/gt4py/next/program_processors/runners/dace/transformations/loop_blocking.py
+5 −2 src/gt4py/next/program_processors/runners/dace/transformations/utils.py
+7 −1 src/gt4py/next/program_processors/runners/dace/workflow/factory.py
+4 −2 src/gt4py/next/program_processors/runners/gtfn.py
+9 −0 src/gt4py/storage/cartesian/layout_registry.py
+6 −4 tests/cartesian_tests/integration_tests/feature_tests/test_field_layouts.py
+29 −10 tests/cartesian_tests/integration_tests/multi_feature_tests/test_code_generation.py
+1 −4 tests/cartesian_tests/integration_tests/multi_feature_tests/test_suites.py
+19 −0 tests/cartesian_tests/unit_tests/test_gtc/dace/test_oir_to_tasklet.py
+1 −1 tests/next_tests/unit_tests/otf_tests/compilation_tests/build_systems_tests/test_compiledb.py
+13 −13 uv.lock
2 changes: 1 addition & 1 deletion ndsl/comm/communicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -786,7 +786,7 @@ def __init__(
"Communicator needs to be instantiated with communication subsystem"
f" derived from `comm_abc.Comm`, got {type(comm)}."
)
if comm.Get_size() != partitioner.total_ranks:
if comm.Get_size() < partitioner.total_ranks:
raise ValueError(
f"was given a partitioner for {partitioner.total_ranks} ranks but a "
f"comm object with only {comm.Get_size()} ranks, are we running "
Expand Down
2 changes: 2 additions & 0 deletions ndsl/config/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ class BackendLoopOrder(Enum):
"orch:dace:cpu:KJI": "dace:cpu_KJI",
"st:dace:gpu:KJI": "dace:gpu",
"orch:dace:gpu:KJI": "dace:gpu",
"st:dace:gpu:IJK": "dace:gpu_IJK",
"orch:dace:gpu:IJK": "dace:gpu_IJK",
}
"""Internal: match the NDSL backend names with the GT4Py names"""

Expand Down
64 changes: 33 additions & 31 deletions ndsl/dsl/caches/cache_location.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,46 +7,48 @@ def identify_code_path(
partitioner: Partitioner,
single_code_path: bool,
) -> FV3CodePath:
"""Determine which code path your rank will hit.
"""
Determine which code path your rank will hit.

If single_code_path is True, single_code_path is True,
only one code path exists (case of doubly periodic grid).
If single_code_path is True, only one code path exists,
e.g. in case of a doubly periodic grid.
If single_code_path is False, we are in the case of the
cube-sphere and we will look at our position on the tile."""
cube-sphere and we will look at our position on the tile.
"""

# Doubly-periodic or single tile grid
if single_code_path:
if single_code_path or partitioner.layout == (1, 1):
return FV3CodePath.All

# Cube-sphere
if partitioner.layout == (1, 1):
return FV3CodePath.All
elif partitioner.layout[0] == 1 or partitioner.layout[1] == 1:
if partitioner.layout[0] <= 1 or partitioner.layout[1] <= 1:
raise NotImplementedError(
f"Build for layout {partitioner.layout} is not handled"
f"Build for layout {partitioner.layout} is not handled."
)
else:
if partitioner.tile.on_tile_bottom(rank):
if partitioner.tile.on_tile_left(rank):
return FV3CodePath.BottomLeft
if partitioner.tile.on_tile_right(rank):
return FV3CodePath.BottomRight
else:
return FV3CodePath.Bottom
if partitioner.tile.on_tile_top(rank):
if partitioner.tile.on_tile_left(rank):
return FV3CodePath.TopLeft
if partitioner.tile.on_tile_right(rank):
return FV3CodePath.TopRight
else:
return FV3CodePath.Top
else:
if partitioner.tile.on_tile_left(rank):
return FV3CodePath.Left
if partitioner.tile.on_tile_right(rank):
return FV3CodePath.Right
else:
return FV3CodePath.Center

# Bottom row
if partitioner.tile.on_tile_bottom(rank):
if partitioner.tile.on_tile_left(rank):
return FV3CodePath.BottomLeft
if partitioner.tile.on_tile_right(rank):
return FV3CodePath.BottomRight
return FV3CodePath.Bottom

# Top row
if partitioner.tile.on_tile_top(rank):
if partitioner.tile.on_tile_left(rank):
return FV3CodePath.TopLeft
if partitioner.tile.on_tile_right(rank):
return FV3CodePath.TopRight
return FV3CodePath.Top

# Left & right column with corners already handled
if partitioner.tile.on_tile_left(rank):
return FV3CodePath.Left
if partitioner.tile.on_tile_right(rank):
return FV3CodePath.Right

return FV3CodePath.Center


def get_cache_fullpath(code_path: FV3CodePath) -> str:
Expand Down
2 changes: 2 additions & 0 deletions ndsl/dsl/caches/codepath.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@

class FV3CodePath(enum.Enum):
"""Enum listing all possible code paths on a cube sphere.

For any layout the cube sphere has up to 9 different code paths depending on
the positioning of the rank on the tile and which of the edge/corner cases
it has to handle, as well as the possibility for all boundary computations in
the 1x1 layout case.

Since the framework inlines code to optimize, we _cannot_ pre-suppose which code
being kept and/or ejected. This enum serves as the ground truth to map rank to
the proper generated code.
Expand Down
68 changes: 47 additions & 21 deletions ndsl/dsl/dace/dace_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,20 @@
from gt4py.cartesian.utils.compiler import cxx_compiler_defaults, gpu_configuration

from ndsl import LocalComm
from ndsl.comm import Comm
from ndsl.comm.communicator import Communicator
from ndsl.comm.partitioner import Partitioner
from ndsl.config import Backend
from ndsl.dsl import NDSL_GLOBAL_PRECISION
from ndsl.dsl.caches.cache_location import identify_code_path
from ndsl.dsl.caches.codepath import FV3CodePath
from ndsl.dsl.dace.hardware_config import get_gpu_hardware_defaults
from ndsl.optional_imports import cupy as cp
from ndsl.performance.collector import NullPerformanceCollector, PerformanceCollector
from ndsl.performance.collector import (
AbstractPerformanceCollector,
NullPerformanceCollector,
PerformanceCollector,
)


if TYPE_CHECKING:
Expand Down Expand Up @@ -166,8 +172,8 @@ def __init__(
Args:
communicator: used for setting the distributed caches
backend: string for the backend
tile_nx: x/y domain size for a single time
tile_nz: z domain size for a single time
tile_nx: x/y domain size for a single tile
tile_nz: z domain size for a single tile
orchestration: orchestration mode from DaCeOrchestration
time: trigger performance collection, available to user with
`performance_collector`
Expand All @@ -181,16 +187,12 @@ def __init__(
# ToDo: DaceConfig becomes a bit more than a read-only config
# with this. Should be refactored into a DaceExecutor carrying a config
self.loaded_dace_executables: DaceExecutables = {}
self.performance_collector = (
PerformanceCollector(
"InternalOrchestrationTimer",
comm=(
LocalComm(0, 6, {}) if communicator is None else communicator.comm
),
if not time:
self.performance_collector: AbstractPerformanceCollector = (
NullPerformanceCollector()
)
if time
else NullPerformanceCollector()
)
else:
self.set_timer(communicator.comm if communicator else None)

# Temporary. This is a bit too out of the ordinary for the common user.
# We should refactor the architecture to allow for a `gtc:orchestrated:dace:X`
Expand Down Expand Up @@ -264,21 +266,29 @@ def __init__(
march_option = "-mcpu=native" if is_arm_neoverse else "-march=native"
# Removed --fast-math
gpu_config = gpu_configuration(GT4PY_COMPILE_OPT_LEVEL)
gpu_cflags = " ".join(gpu_config.gpu_compile_flags).strip()
dace.config.Config.set(
"compiler",
"cuda",
"args",
value=f"-std=c++14 -Xcompiler -fPIC -O{optimization_level} -Xcompiler {march_option} {gpu_config.gpu_compile_flags}",
value=f"-std=c++14 -Xcompiler -fPIC -O{optimization_level} -Xcompiler {march_option} {gpu_cflags}",
)

# Target compilation for hardware micro-code capacities
gpu_defaults = get_gpu_hardware_defaults()
dace.config.Config.set(
"compiler",
"cuda",
"cuda_arch",
value=f"{gpu_defaults.compute_capability}",
)

cuda_sm = cp.cuda.Device(0).compute_capability if cp else 60
dace.config.Config.set("compiler", "cuda", "cuda_arch", value=f"{cuda_sm}")
# Block size/thread count is defaulted to an average value for recent
# hardware (Pascal and upward). The problem of setting an optimized
# block/thread is both hardware and problem dependant. Fine tuners
# available in DaCe should be relied on for further tuning of this value.
# Default block size for kernels launch
dace.config.Config.set(
"compiler", "cuda", "default_block_size", value="64,8,1"
"compiler",
"cuda",
"default_block_size",
value=str(gpu_defaults.block_size)[1:-1],
)
# Potentially buggy - deactivate
dace.config.Config.set(
Expand Down Expand Up @@ -412,4 +422,20 @@ def from_dict(cls, data: dict) -> Self:
config.rank_size = data["rank_size"]
config.layout = data["layout"]
config.tile_resolution = data["tile_resolution"]
return config
# TODO
# Computed properties like `self.code_path` and `self.do_compile`
# aren't updated.
# We also don't `set_distributed_caches()` based on that updated
# information.
raise NotImplementedError(
"Implementation of `DaceConfig.from_dict()` is incomplete."
)

def set_timer(self, comm: Comm | None) -> None:
    """Install a fresh orchestration timer on this configuration.

    Replaces `self.performance_collector` with a new `PerformanceCollector`
    named "InternalOrchestrationTimer".

    Args:
        comm: communication object handed to the collector. When `None`,
            a `LocalComm(0, 6, {})` is created and used instead.
    """
    # TODO: a timer should not live on a Configuration object, and even less
    # be set up from the outside. To be moved during the executor refactor.
    timer_comm = LocalComm(0, 6, {}) if comm is None else comm
    self.performance_collector = PerformanceCollector(
        "InternalOrchestrationTimer",
        comm=timer_comm,
    )
119 changes: 119 additions & 0 deletions ndsl/dsl/dace/hardware_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import dataclasses
import os
import sys

from ndsl import ndsl_log
from ndsl.optional_imports import cupy as cp


# PCI-SIG vendor ids mapped to a human-readable vendor name.
# Taken straight out of https://pcisig.com/membership/member-companies
# The 0x0 entry is the fallback name used when the vendor cannot be resolved.
_VENDOR_PCI_SIGNAURES: dict[int, str] = {
    0x10DE: "Nvidia",
    0x1002: "AMD",
    0x8086: "Intel",
    0x0: "Unknown",
}

# Cached copy of the hardware defaults: filled by the first call to
# `get_gpu_hardware_defaults()` and returned as-is by every later call.
_GPU_HARDWARE_DEFAULTS = None


def _get_vendor() -> str:
    """Return the vendor name of GPU device 0, resolved from its PCI vendor id.

    The PCI bus id of device 0 is queried through cupy and used to locate the
    device under `/sys/bus/pci/devices`. The `vendor` file found there holds
    the PCI-SIG vendor id, which is mapped to a name with
    `_VENDOR_PCI_SIGNAURES`.

    ⚠️ Only works on Linux - falls back to "Unknown" in other cases.

    Returns:
        A vendor name from `_VENDOR_PCI_SIGNAURES`. "Unknown" is returned
        when not running on Linux, when the device has no sysfs entry or
        when the vendor id is not part of the table.
    """
    unknown_vendor = _VENDOR_PCI_SIGNAURES[0x0]

    if not sys.platform.startswith("linux"):
        return unknown_vendor

    # sysfs names PCI device directories with lowercase hexadecimal digits.
    # Normalize the bus id so that an id reported with uppercase digits
    # (e.g. "0000:AF:00.0") still resolves to the device directory.
    pci_device_id = cp.cuda.runtime.deviceGetPCIBusId(0).lower()
    dev_path = f"/sys/bus/pci/devices/{pci_device_id}"
    if not os.path.exists(dev_path):
        return unknown_vendor

    with open(os.path.join(dev_path, "vendor"), "r") as f:
        vendor_str = f.read().strip().replace("0x", "")
    # Parse once; the id is used for both the lookup and the error message
    vendor_id = int(vendor_str, 16)

    if vendor_id not in _VENDOR_PCI_SIGNAURES:
        ndsl_log.error(f"Unknown GPU vendor with PCI-SIG ID of {vendor_id:#X}")
        return unknown_vendor
    return _VENDOR_PCI_SIGNAURES[vendor_id]


@dataclasses.dataclass
class GPUHardwareDefaults:
    """Default GPU compute settings, grouped per detected hardware."""

    # Name of the GPU vendor these defaults apply to
    vendor: str
    # Default block size; a new empty list per instance when not given
    block_size: list[int] = dataclasses.field(default_factory=list)
    # Nvidia specific; stays at -1 when not given
    compute_capability: int = -1


def get_gpu_hardware_defaults() -> GPUHardwareDefaults:
    """Retrieve default values for GPU computation configuration.

    The defaults are computed on the first call and cached in the module-level
    `_GPU_HARDWARE_DEFAULTS`; later calls return the cached object.

    Without cupy, or when cupy reports no available CUDA device, a generic
    fallback is returned. Otherwise the vendor of device 0 is detected and
    the block size is chosen per vendor (and per compute capability on
    Nvidia hardware).

    Returns:
        The (cached) `GPUHardwareDefaults` for this process.
    """
    global _GPU_HARDWARE_DEFAULTS
    if _GPU_HARDWARE_DEFAULTS is not None:
        return _GPU_HARDWARE_DEFAULTS  # type: ignore[unreachable]

    # Smallest common denominator of massively parallel hardware
    fallback_block_size = [8, 1, 1]

    if not cp or not cp.cuda.is_available():
        ndsl_log.warning("No cupy - defaulting for GPU hardware")
        # NOTE(review): `compute_capability` keeps its -1 default here.
        # Callers formatting it into a compiler architecture flag should
        # guard against that value - TODO confirm.
        _GPU_HARDWARE_DEFAULTS = GPUHardwareDefaults(
            vendor="Unknown",
            block_size=fallback_block_size,
        )
        return _GPU_HARDWARE_DEFAULTS

    # Who goes there
    vendor = _get_vendor()
    if vendor == "Nvidia":
        compute_capability = int(cp.cuda.Device(0).compute_capability)
        # Default block size based on compute capability. Bounds are
        # inclusive so that the first chip of each generation (compute
        # capability 80 or 60) lands in its own tier.
        if compute_capability >= 80:
            # Covers:
            #  - Blackwell (100+)
            #  - Hopper (90-100)
            #  - Ampere (80-90)
            block_size = [128, 1, 1]
        elif compute_capability >= 60:
            # Covers:
            #  - Volta (70-80)
            #  - Pascal (60-70)
            block_size = [64, 8, 1]
        else:
            # For older hardware - we default to the safe warp-size since
            # the dawn of GPGPU on Nvidia hardware
            block_size = [32, 1, 1]

        _GPU_HARDWARE_DEFAULTS = GPUHardwareDefaults(
            vendor=vendor,
            block_size=block_size,
            compute_capability=compute_capability,
        )
    elif vendor == "AMD":
        _GPU_HARDWARE_DEFAULTS = GPUHardwareDefaults(
            vendor=vendor,
            block_size=[64, 1, 1],  # Sized to a 64-wide wavefront (Wave64)
        )
    elif vendor == "Intel":
        _GPU_HARDWARE_DEFAULTS = GPUHardwareDefaults(
            vendor=vendor,
            block_size=[32, 1, 1],  # Intel can run 8, 16 or 32 - 32 is preferred
        )
    else:
        _GPU_HARDWARE_DEFAULTS = GPUHardwareDefaults(
            vendor=vendor,
            block_size=fallback_block_size,
        )

    ndsl_log.info(f"GPU vendor detected: {_GPU_HARDWARE_DEFAULTS.vendor}")

    return _GPU_HARDWARE_DEFAULTS
Loading