Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/tl_templates/cuda/threadblock_swizzle.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

namespace tl {

template <int panel_width, int offset> TL_DEVICE dim3 rasterization2DRow() {
template <int panel_width, int offset = 0> TL_DEVICE dim3 rasterization2DRow() {

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

rg -nP --type=cpp --type=cu --type=h -C3 'rasterization2DRow\s*<'

Repository: tile-ai/tilescale

Length of output: 87


🏁 Script executed:

rg -nP --type=cpp --type=h -C3 'rasterization2DRow\s*<'

Repository: tile-ai/tilescale

Length of output: 43


🏁 Script executed:

find . -name "threadblock_swizzle.h" -type f 2>/dev/null | head -20

Repository: tile-ai/tilescale

Length of output: 151


🏁 Script executed:

cat -n ./src/tl_templates/cuda/threadblock_swizzle.h | head -50

Repository: tile-ai/tilescale

Length of output: 2344


🏁 Script executed:

rg -n 'rasterization2DRow' --type cpp --type h -B2 -A2

Repository: tile-ai/tilescale

Length of output: 956


🏁 Script executed:

rg 'rasterization2DRow\s*<' --type cpp --type h -C5

Repository: tile-ai/tilescale

Length of output: 43


🏁 Script executed:

rg 'rasterization2DRow' --type cpp --type h | grep -v 'template' | head -20

Repository: tile-ai/tilescale

Length of output: 43


🏁 Script executed:

rg 'rasterization2D' --type cpp --type h -C3

Repository: tile-ai/tilescale

Length of output: 2416


🏁 Script executed:

rg 'rasterization2DRow' --type cpp --type h --glob '!threadblock_swizzle.h'

Repository: tile-ai/tilescale

Length of output: 43


🏁 Script executed:

cat -n ./src/tl_templates/hip/threadblock_swizzle.h

Repository: tile-ai/tilescale

Length of output: 2357


Address API inconsistency between CUDA and HIP implementations.

The default template argument offset = 0 was added to the CUDA version, but the HIP version in src/tl_templates/hip/threadblock_swizzle.h remains unchanged and has no matching default. As a result, the signatures of both rasterization2DRow() and rasterization2DColumn() now differ between the CUDA and HIP implementations. Update the HIP version to match the CUDA signature for API consistency, or document why they intentionally differ.

🤖 Prompt for AI Agents
In src/tl_templates/cuda/threadblock_swizzle.h around line 7, the CUDA template
signature template<int panel_width, int offset = 0> dim3 rasterization2DRow()
introduces a default offset parameter that the HIP implementation does not have.
To restore API consistency, update src/tl_templates/hip/threadblock_swizzle.h to
add the matching default template parameter (offset = 0) to the
rasterization2DRow() declaration (and make the same change for
rasterization2DColumn() if present) so that both CUDA and HIP signatures match
exactly. Keep behavior identical. If the divergence is intentional, do not
change the signatures; add a comment explaining the difference instead.

const unsigned int block_idx = blockIdx.x + blockIdx.y * gridDim.x;
const unsigned int grid_size = gridDim.x * gridDim.y;
const unsigned int panel_size = panel_width * gridDim.x;
Expand All @@ -23,7 +23,8 @@ template <int panel_width, int offset> TL_DEVICE dim3 rasterization2DRow() {
return {col_idx, row_idx, blockIdx.z};
}

template <int panel_width, int offset> TL_DEVICE dim3 rasterization2DColumn() {
template <int panel_width, int offset = 0>
TL_DEVICE dim3 rasterization2DColumn() {
const unsigned int block_idx = blockIdx.x + blockIdx.y * gridDim.x;
const unsigned int grid_size = gridDim.x * gridDim.y;
const unsigned int panel_size = panel_width * gridDim.y;
Expand Down
29 changes: 14 additions & 15 deletions tilelang/language/overrides/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,10 +164,10 @@ def tilelang_visit_for(self, node: doc.For) -> None: # pylint: disable=unused-a
"Expect the for loop to be one of the following: "
"range, T.serial, T.grid, T.parallel, T.vectorized, T.unroll, T.thread_binding",
)
with self.var_table.with_frame():
with iter_val as iters:
self.eval_assign(target=node.target, source=iters, bind_value=tvm_tir_parser.bind_for_value)
self.visit_body(node.body)
with self.var_table.with_frame(), iter_val as iters:
self.eval_assign(
target=node.target, source=iters, bind_value=tvm_tir_parser.bind_for_value)
self.visit_body(node.body)
return

# Stepped inclusive serial: require positive integer step
Expand All @@ -192,16 +192,15 @@ def tilelang_visit_for(self, node: doc.For) -> None: # pylint: disable=unused-a
# Use tvm.tir.floordiv via builder ops from tilelang.tir.ir if available
# Avoid importing op wrappers; compute using arithmetic to keep it simple.
# We construct: T.ceildiv((end - start), step)
extent = T.ceildiv(end - start, step_val) # type: ignore[operator]
extent = T.ceildiv(end - start, step_val) # type: ignore[operator]

for_frame = T.serial(0, extent, annotations=annotations)
with self.var_table.with_frame():
with for_frame as t:
# Bind loop target as Let var: i = start + t * step
stepped_index = start + t * step_val # type: ignore[operator]
self.eval_assign(
target=node.target,
source=stepped_index,
bind_value=tvm_tir_parser.bind_assign_value,
)
self.visit_body(node.body)
with self.var_table.with_frame(), for_frame as t:
# Bind loop target as Let var: i = start + t * step
stepped_index = start + t * step_val # type: ignore[operator]
self.eval_assign(
target=node.target,
source=stepped_index,
bind_value=tvm_tir_parser.bind_assign_value,
)
self.visit_body(node.body)
Loading