Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 24 additions & 3 deletions transformer_engine/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,11 @@ if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
endif()
endif()

# Process CMAKE_CUDA_ARCHITECTURES to separate generic and specific architectures
# Process CMAKE_CUDA_ARCHITECTURES to separate standard, generic, and specific architectures.
# - NVTE_STANDARD_ARCHS: pre-Blackwell archs (e.g. 75, 80, 89, 90). Applied to all CUDA sources.
# - NVTE_GENERIC_ARCHS: Blackwell family heads (e.g. 100, 120). Applied to non-arch-specific sources only.
# - NVTE_SPECIFIC_ARCHS: Blackwell specific targets (e.g. 100a, 120f). Applied to arch-specific sources only.
# Clear any normal variables that already carry these names so each list starts out unset.
unset(NVTE_STANDARD_ARCHS)
unset(NVTE_GENERIC_ARCHS)
unset(NVTE_SPECIFIC_ARCHS)

Expand Down Expand Up @@ -79,6 +83,10 @@ if(NOT arch_120_index EQUAL -1)
endif()
endif()

# Treat every architecture still listed in CMAKE_CUDA_ARCHITECTURES as a standard
# (pre-Blackwell) architecture. The list is copied, not moved.
# NOTE(review): this relies on the Blackwell entries having been removed from
# CMAKE_CUDA_ARCHITECTURES earlier in this file -- confirm against the code above.
# These are applied to all CUDA sources (both generic and arch-specific).
set(NVTE_STANDARD_ARCHS ${CMAKE_CUDA_ARCHITECTURES})

# Each entry is spliced verbatim into
#   --generate-code=arch=compute_<arch>,code=sm_<arch>
# and the transformer_engine target sets CUDA_ARCHITECTURES to OFF, so CMake itself
# never translates the list. CMake-only spellings such as "native", "all",
# "all-major", "80-real" or "80-virtual" would therefore turn into invalid nvcc
# flags. Reject them at configure time with a clear message instead of failing
# later in the middle of compilation.
foreach(nvte_standard_arch IN LISTS NVTE_STANDARD_ARCHS)
  if(NOT "${nvte_standard_arch}" MATCHES "^[0-9]+[a-z]?$")
    message(FATAL_ERROR
      "Unsupported entry '${nvte_standard_arch}' in CMAKE_CUDA_ARCHITECTURES. "
      "Architectures are passed to nvcc as "
      "--generate-code=arch=compute_<arch>,code=sm_<arch>, so each entry must be "
      "a compute capability number with an optional one-letter suffix "
      "(e.g. 80, 90, 90a).")
  endif()
endforeach()

# Header location of the cuDNN frontend API, given relative to this directory.
set(CUDNN_FRONTEND_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/cudnn-frontend/include")
Expand Down Expand Up @@ -192,9 +200,13 @@ list(APPEND transformer_engine_SOURCES ${transformer_engine_cuda_arch_specific_s
${transformer_engine_cuda_sources}
${transformer_engine_cpp_sources})

# Set compile options for CUDA sources with generic architectures
# Set compile options for CUDA sources with generic architectures.
# These get standard archs (pre-Blackwell) + generic Blackwell family heads.
foreach(cuda_source IN LISTS transformer_engine_cuda_sources)
set(arch_compile_options)
# Append one --generate-code flag per architecture, walking the standard list
# first and the generic list second.
foreach(arch IN LISTS NVTE_STANDARD_ARCHS NVTE_GENERIC_ARCHS)
  list(APPEND arch_compile_options
       "--generate-code=arch=compute_${arch},code=sm_${arch}")
endforeach()
Expand All @@ -209,9 +221,14 @@ foreach(cuda_source IN LISTS transformer_engine_cuda_sources)
endif()
endforeach()

# Set compile options for CUDA sources with specific architectures
# Set compile options for CUDA sources with arch-specific features.
# These get standard archs (pre-Blackwell) + Blackwell specific targets (a/f suffix).
# They must NOT get generic Blackwell archs, as they use family/arch-specific PTX features.
foreach(cuda_source IN LISTS transformer_engine_cuda_arch_specific_sources)
set(arch_compile_options)
# Append one --generate-code flag per architecture, walking the standard list
# first and the arch-specific list second.
foreach(arch IN LISTS NVTE_STANDARD_ARCHS NVTE_SPECIFIC_ARCHS)
  list(APPEND arch_compile_options
       "--generate-code=arch=compute_${arch},code=sm_${arch}")
endforeach()
Expand All @@ -232,6 +249,10 @@ list(APPEND transformer_engine_SOURCES
endif()

# Build the shared library from the collected source list.
add_library(transformer_engine SHARED
            ${transformer_engine_SOURCES})

# Turn off CMake's own architecture flag generation for this target. The
# architecture flags come from per-source COMPILE_OPTIONS built out of
# NVTE_STANDARD_ARCHS, NVTE_GENERIC_ARCHS, and NVTE_SPECIFIC_ARCHS.
set_property(TARGET transformer_engine PROPERTY CUDA_ARCHITECTURES OFF)

# Expose the public headers to this target and to everything that links it.
target_include_directories(transformer_engine
                           PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")

Expand Down